package MarkdownParser; use strict; use warnings; our $VERSION = '1.0'; my %CLOSING_TAGS = ( ulist => "", olist => "", blockquote => "", table => "", ); sub new { my $class = shift; return bless { state => 'paragraph', output => '', lines => [], }, $class; } sub _init_handlers { my $self = shift; $self->{handlers} = { code_block_start => sub { my ( $self, $line_type ) = @_; $self->transition_to_state('code_block'); $self->{output} .= "
\n";
        },
        header => sub {
            my ( $self, $line_type ) = @_;
            $self->transition_to_state('paragraph');
            my $level = length( $line_type->{match} );
            $self->{output} .=
                ""
              . $self->parse_inline( $line_type->{text} )
              . "\n";
        },
        blockquote => sub {
            my ( $self, $line_type ) = @_;
            $self->handle_list_or_blockquote( 'blockquote', '
', $line_type->{text} ); }, ulist => sub { my ( $self, $line_type ) = @_; $self->handle_list_or_blockquote( 'ulist', '
    ', $line_type->{text} ); }, olist => sub { my ( $self, $line_type ) = @_; $self->handle_list_or_blockquote( 'olist', '
      ', $line_type->{text} ); }, horizontal_rule => sub { my ( $self, $line_type ) = @_; $self->transition_to_state('paragraph'); $self->{output} .= "
      \n"; }, table_row => sub { my ( $self, $line_type ) = @_; if ( $self->{state} ne 'table' ) { $self->transition_to_state('table'); $self->{output} .= "\n"; $self->{table_is_header} = 1; } $self->handle_table_row( $line_type->{text} ); }, table_separator => sub { my ( $self, $line_type ) = @_; if ( $self->{state} eq 'table' ) { $self->{table_is_header} = 0; } }, blank => sub { my ( $self, $line_type ) = @_; $self->finish_state(); }, }; } sub parse { my ( $self, $text ) = @_; $self->{output} = ''; $self->{lines} = [ split /\r?\n/, $text ]; $self->{state} = 'paragraph'; delete $self->{paragraph_buffer}; $self->_init_handlers(); foreach my $line ( @{ $self->{lines} } ) { $self->process_line($line); } $self->finish_state(); return $self->{output}; } sub process_line { my ( $self, $line ) = @_; if ( $self->{state} eq 'code_block' ) { $self->handle_code_block_line($line); return; } if ( $self->{state} eq 'table' ) { $self->handle_table_line($line); return; } my $line_type = $self->detect_line_type($line); my $handler = $self->{handlers}->{ $line_type->{type} }; if ($handler) { $handler->( $self, $line_type ); } else { $self->handle_paragraph_line($line); } } sub detect_line_type { my ( $self, $line ) = @_; return { type => 'code_block_start' } if $line =~ /^```/; if ( $line =~ /^(#{1,6})\s+(.+)/ ) { return { type => 'header', match => $1, text => $2 }; } if ( $line =~ /^>\s+(.+)/ ) { return { type => 'blockquote', text => $1 }; } if ( $line =~ /^[-*+]\s+(.+)/ ) { return { type => 'ulist', text => $1 }; } if ( $line =~ /^\d+\.\s+(.+)/ ) { return { type => 'olist', text => $1 }; } return { type => 'horizontal_rule' } if $line =~ /^[-*_]{3,}$/; return { type => 'blank' } if $line =~ /^\s*$/; if ( $line =~ /^\|.+\|/ ) { if ( $line =~ /^\|[\s\-:]*\|/ ) { return { type => 'table_separator' }; } return { type => 'table_row', text => $line }; } return { type => 'paragraph' }; } sub transition_to_state { my ( $self, $new_state ) = @_; if ( $self->{state} ne $new_state ) { $self->finish_state(); $self->{state} = $new_state; } } sub handle_code_block_line { my ( $self, $line ) = @_; if ( $line =~ /^```/ ) { $self->{output} .= "\n"; $self->transition_to_state('paragraph'); } else { $self->{output} .= escape_html($line) . "\n"; } } sub handle_list_or_blockquote { my ( $self, $target_state, $open_tag, $text ) = @_; if ( $self->{state} ne $target_state ) { $self->transition_to_state($target_state); $self->{output} .= "$open_tag\n"; } my $inner_tag = $target_state eq 'blockquote' ? 'p' : 'li'; $self->{output} .= "<$inner_tag>" . $self->parse_inline($text) . "\n"; } sub handle_paragraph_line { my ( $self, $line ) = @_; if ( $self->{state} ne 'paragraph' ) { $self->transition_to_state('paragraph'); } $self->{paragraph_buffer} //= ''; $self->{paragraph_buffer} .= ( $self->{paragraph_buffer} ? ' ' : '' ) . $line; } sub handle_table_line { my ( $self, $line ) = @_; my $line_type = $self->detect_line_type($line); if ( $line_type->{type} eq 'table_separator' ) { $self->{table_is_header} = 0; return; } if ( $line_type->{type} eq 'table_row' ) { $self->handle_table_row( $line_type->{text} ); return; } $self->finish_state(); $self->process_line($line); } sub handle_table_row { my ( $self, $row ) = @_; $row =~ s/^\|\s*//; $row =~ s/\s*\|$//; my @cells = map { s/^\s+//; s/\s+$//; $_ } split( /\|/, $row ); $self->{output} .= "\n"; for my $cell (@cells) { my $tag = $self->{table_is_header} ? 'th' : 'td'; $self->{output} .= "<$tag>" . $self->parse_inline($cell) . "\n"; } $self->{output} .= "\n"; } sub finish_state { my $self = shift; if ( $self->{state} eq 'paragraph' && exists $self->{paragraph_buffer} && $self->{paragraph_buffer} =~ /\S/ ) { $self->{output} .= "

      " . $self->parse_inline( $self->{paragraph_buffer} ) . "

      \n"; delete $self->{paragraph_buffer}; } elsif ( exists $CLOSING_TAGS{ $self->{state} } ) { $self->{output} .= $CLOSING_TAGS{ $self->{state} } . "\n"; } $self->{state} = 'paragraph'; } sub parse_inline { my ( $self, $text ) = @_; my @placeholders; my $placeholder_idx = 0; $text =~ s/`([^`]+)`/sub { my $idx = $placeholder_idx++; push @placeholders, { type => 'code', content => $1 }; return "\x01$idx\x02"; }->()/ge; $text =~ s/!\[([^\]]*)\]\(((?:[^()]|\([^()]*\))+)\)/sub { my $idx = $placeholder_idx++; push @placeholders, { type => 'image', alt => $1, url => $2 }; return "\x01$idx\x02"; }->()/ge; $text =~ s/\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)/sub { my $idx = $placeholder_idx++; push @placeholders, { type => 'link', text => $1, url => $2 }; return "\x01$idx\x02"; }->()/ge; my @bold_parts; my $bold_idx = 0; $text =~ s/\*\*\*((?:[^*]|\*(?!\*))+)\*\*\*/sub { my $idx = $bold_idx++; push @bold_parts, { type => 'bold', content => $1 }; return "\x01B$idx\x02"; }->()/ge; $text =~ s/\*\*((?:[^*]|\*(?!\*))+)\*\*/sub { my $idx = $bold_idx++; push @bold_parts, { type => 'bold', content => $1 }; return "\x01B$idx\x02"; }->()/ge; $text =~ s/___((?:[^_]|_(?!_))+?)___/$1<\/strong>/g; $text =~ s/__((?:[^_]|_(?!_))+?)__/$1<\/strong>/g; my @italic_parts; my $italic_idx = 0; $text =~ s/\*([^*]+)\*/sub { my $idx = $italic_idx++; push @italic_parts, { type => 'italic', content => $1 }; return "\x01I$idx\x02"; }->()/ge; $text =~ s/_([^_]+)_/sub { my $idx = $italic_idx++; push @italic_parts, { type => 'italic', content => $1 }; return "\x01I$idx\x02"; }->()/ge; for ( my $i = 0 ; $i < @bold_parts ; $i++ ) { my $part = $bold_parts[$i]; my $content = $part->{content}; $content =~ s/\*([^*]+)\*/sub { my $idx = $italic_idx++; push @italic_parts, { type => 'italic', content => $1 }; return "\x01I$idx\x02"; }->()/ge; $content =~ s/_([^_]+)_/sub { my $idx = $italic_idx++; push @italic_parts, { type => 'italic', content => $1 }; return "\x01I$idx\x02"; }->()/ge; $text =~ s/\x01B$i\x02/$content<\/strong>/; } my @format_parts; my $format_idx = 0; $text =~ s/<(strong|em)>(.*?)<\/(strong|em)>/sub { my $idx = $format_idx++; push @format_parts, { tag => $1, content => $2 }; return "\x01F$idx\x02"; }->()/gse; $text = escape_html($text); for ( my $i = 0 ; $i < @format_parts ; $i++ ) { my $part = $format_parts[$i]; $text =~ s/\x01F$i\x02/<$part->{tag}>@{[escape_html($part->{content})]}<\/$part->{tag}>/; } for ( my $i = 0 ; $i < @italic_parts ; $i++ ) { my $part = $italic_parts[$i]; $text =~ s/\x01I$i\x02/@{[escape_html($part->{content})]}<\/em>/; } for ( my $i = 0 ; $i < @placeholders ; $i++ ) { my $part = $placeholders[$i]; my $replacement; if ( $part->{type} eq 'code' ) { $replacement = "" . escape_html( $part->{content} ) . ""; } elsif ( $part->{type} eq 'image' ) { if ( is_safe_url( $part->{url} ) ) { $replacement = "{url} ) . "\" alt=\"" . escape_html( $part->{alt} ) . "\">"; } else { $replacement = escape_html( $part->{alt} ); } } elsif ( $part->{type} eq 'link' ) { if ( is_safe_url( $part->{url} ) ) { $replacement = "{url} ) . "\">" . escape_html( $part->{text} ) . ""; } else { $replacement = escape_html( $part->{text} ); } } $text =~ s/\x01$i\x02/$replacement/; } return $text; } sub is_safe_url { my ($url) = @_; return 0 if $url =~ /^\s*javascript:/i; return 0 if $url =~ /^\s*data:/i; return 0 if $url =~ /^\s*vbscript:/i; return 0 if $url =~ /^\s*file:/i; return 1; } sub escape_html { my ($text) = @_; $text =~ s/&/&/g; $text =~ s//>/g; $text =~ s/"/"/g; $text =~ s/'/'/g; return $text; } 1;