From 2a7daa13fd41deefba37a752f8819672b69c9b16 Mon Sep 17 00:00:00 2001 From: Kharec Date: Wed, 12 Nov 2025 19:28:52 +0100 Subject: [PATCH] feat: main module --- lib/MarkdownParser.pm | 400 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100644 lib/MarkdownParser.pm diff --git a/lib/MarkdownParser.pm b/lib/MarkdownParser.pm new file mode 100644 index 0000000..f034939 --- /dev/null +++ b/lib/MarkdownParser.pm @@ -0,0 +1,400 @@ +package MarkdownParser; + +use strict; +use warnings; + +our $VERSION = '1.0'; + +my %CLOSING_TAGS = ( + ulist => "", + olist => "", + blockquote => "", + table => "", +); + +sub new { + my $class = shift; + return bless { + state => 'paragraph', + output => '', + lines => [], + }, $class; +} + +sub _init_handlers { + my $self = shift; + $self->{handlers} = { + code_block_start => sub { + my ( $self, $line_type ) = @_; + $self->transition_to_state('code_block'); + $self->{output} .= "
\n";
+        },
+        header => sub {
+            my ( $self, $line_type ) = @_;
+            $self->transition_to_state('paragraph');
+            my $level = length( $line_type->{match} );
+            $self->{output} .=
+                ""
+              . $self->parse_inline( $line_type->{text} )
+              . "\n";
+        },
+        blockquote => sub {
+            my ( $self, $line_type ) = @_;
+            $self->handle_list_or_blockquote( 'blockquote', '
', + $line_type->{text} ); + }, + ulist => sub { + my ( $self, $line_type ) = @_; + $self->handle_list_or_blockquote( 'ulist', '
    ', + $line_type->{text} ); + }, + olist => sub { + my ( $self, $line_type ) = @_; + $self->handle_list_or_blockquote( 'olist', '
      ', + $line_type->{text} ); + }, + horizontal_rule => sub { + my ( $self, $line_type ) = @_; + $self->transition_to_state('paragraph'); + $self->{output} .= "
      \n"; + }, + table_row => sub { + my ( $self, $line_type ) = @_; + if ( $self->{state} ne 'table' ) { + $self->transition_to_state('table'); + $self->{output} .= "\n"; + $self->{table_is_header} = 1; + } + $self->handle_table_row( $line_type->{text} ); + }, + table_separator => sub { + my ( $self, $line_type ) = @_; + if ( $self->{state} eq 'table' ) { + $self->{table_is_header} = 0; + } + }, + blank => sub { + my ( $self, $line_type ) = @_; + $self->finish_state(); + }, + }; +} + +sub parse { + my ( $self, $text ) = @_; + $self->{output} = ''; + $self->{lines} = [ split /\r?\n/, $text ]; + $self->{state} = 'paragraph'; + delete $self->{paragraph_buffer}; + $self->_init_handlers(); + + foreach my $line ( @{ $self->{lines} } ) { + $self->process_line($line); + } + + $self->finish_state(); + return $self->{output}; +} + +sub process_line { + my ( $self, $line ) = @_; + + if ( $self->{state} eq 'code_block' ) { + $self->handle_code_block_line($line); + return; + } + + if ( $self->{state} eq 'table' ) { + $self->handle_table_line($line); + return; + } + + my $line_type = $self->detect_line_type($line); + my $handler = $self->{handlers}->{ $line_type->{type} }; + if ($handler) { + $handler->( $self, $line_type ); + } + else { + $self->handle_paragraph_line($line); + } +} + +sub detect_line_type { + my ( $self, $line ) = @_; + + return { type => 'code_block_start' } if $line =~ /^```/; + + if ( $line =~ /^(#{1,6})\s+(.+)/ ) { + return { type => 'header', match => $1, text => $2 }; + } + + if ( $line =~ /^>\s+(.+)/ ) { + return { type => 'blockquote', text => $1 }; + } + + if ( $line =~ /^[-*+]\s+(.+)/ ) { + return { type => 'ulist', text => $1 }; + } + + if ( $line =~ /^\d+\.\s+(.+)/ ) { + return { type => 'olist', text => $1 }; + } + + return { type => 'horizontal_rule' } if $line =~ /^[-*_]{3,}$/; + return { type => 'blank' } if $line =~ /^\s*$/; + + if ( $line =~ /^\|.+\|/ ) { + if ( $line =~ /^\|[\s\-:]*\|/ ) { + return { type => 'table_separator' }; + } + return { type => 'table_row', text => $line }; + } + + return { type => 'paragraph' }; +} + +sub transition_to_state { + my ( $self, $new_state ) = @_; + + if ( $self->{state} ne $new_state ) { + $self->finish_state(); + $self->{state} = $new_state; + } +} + +sub handle_code_block_line { + my ( $self, $line ) = @_; + + if ( $line =~ /^```/ ) { + $self->{output} .= "\n"; + $self->transition_to_state('paragraph'); + } + else { + $self->{output} .= escape_html($line) . "\n"; + } +} + +sub handle_list_or_blockquote { + my ( $self, $target_state, $open_tag, $text ) = @_; + + if ( $self->{state} ne $target_state ) { + $self->transition_to_state($target_state); + $self->{output} .= "$open_tag\n"; + } + + my $inner_tag = $target_state eq 'blockquote' ? 'p' : 'li'; + $self->{output} .= + "<$inner_tag>" . $self->parse_inline($text) . "\n"; +} + +sub handle_paragraph_line { + my ( $self, $line ) = @_; + + if ( $self->{state} ne 'paragraph' ) { + $self->transition_to_state('paragraph'); + } + + $self->{paragraph_buffer} //= ''; + $self->{paragraph_buffer} .= + ( $self->{paragraph_buffer} ? ' ' : '' ) . $line; +} + +sub handle_table_line { + my ( $self, $line ) = @_; + + my $line_type = $self->detect_line_type($line); + + if ( $line_type->{type} eq 'table_separator' ) { + $self->{table_is_header} = 0; + return; + } + + if ( $line_type->{type} eq 'table_row' ) { + $self->handle_table_row( $line_type->{text} ); + return; + } + + $self->finish_state(); + $self->process_line($line); +} + +sub handle_table_row { + my ( $self, $row ) = @_; + + $row =~ s/^\|\s*//; + $row =~ s/\s*\|$//; + + my @cells = map { s/^\s+//; s/\s+$//; $_ } split( /\|/, $row ); + + $self->{output} .= "\n"; + for my $cell (@cells) { + my $tag = $self->{table_is_header} ? 'th' : 'td'; + $self->{output} .= "<$tag>" . $self->parse_inline($cell) . "\n"; + } + $self->{output} .= "\n"; +} + +sub finish_state { + my $self = shift; + + if ( $self->{state} eq 'paragraph' + && exists $self->{paragraph_buffer} + && $self->{paragraph_buffer} =~ /\S/ ) + { + $self->{output} .= + "

      " . $self->parse_inline( $self->{paragraph_buffer} ) . "

      \n"; + delete $self->{paragraph_buffer}; + } + elsif ( exists $CLOSING_TAGS{ $self->{state} } ) { + $self->{output} .= $CLOSING_TAGS{ $self->{state} } . "\n"; + } + + $self->{state} = 'paragraph'; +} + +sub parse_inline { + my ( $self, $text ) = @_; + + my @placeholders; + my $placeholder_idx = 0; + + $text =~ s/`([^`]+)`/sub { + my $idx = $placeholder_idx++; + push @placeholders, { type => 'code', content => $1 }; + return "\x01$idx\x02"; + }->()/ge; + + $text =~ s/!\[([^\]]*)\]\(((?:[^()]|\([^()]*\))+)\)/sub { + my $idx = $placeholder_idx++; + push @placeholders, { type => 'image', alt => $1, url => $2 }; + return "\x01$idx\x02"; + }->()/ge; + + $text =~ s/\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)/sub { + my $idx = $placeholder_idx++; + push @placeholders, { type => 'link', text => $1, url => $2 }; + return "\x01$idx\x02"; + }->()/ge; + + my @bold_parts; + my $bold_idx = 0; + $text =~ s/\*\*\*((?:[^*]|\*(?!\*))+)\*\*\*/sub { + my $idx = $bold_idx++; + push @bold_parts, { type => 'bold', content => $1 }; + return "\x01B$idx\x02"; + }->()/ge; + $text =~ s/\*\*((?:[^*]|\*(?!\*))+)\*\*/sub { + my $idx = $bold_idx++; + push @bold_parts, { type => 'bold', content => $1 }; + return "\x01B$idx\x02"; + }->()/ge; + $text =~ s/___((?:[^_]|_(?!_))+?)___/$1<\/strong>/g; + $text =~ s/__((?:[^_]|_(?!_))+?)__/$1<\/strong>/g; + + my @italic_parts; + my $italic_idx = 0; + $text =~ s/\*([^*]+)\*/sub { + my $idx = $italic_idx++; + push @italic_parts, { type => 'italic', content => $1 }; + return "\x01I$idx\x02"; + }->()/ge; + $text =~ s/_([^_]+)_/sub { + my $idx = $italic_idx++; + push @italic_parts, { type => 'italic', content => $1 }; + return "\x01I$idx\x02"; + }->()/ge; + + for ( my $i = 0 ; $i < @bold_parts ; $i++ ) { + my $part = $bold_parts[$i]; + my $content = $part->{content}; + $content =~ s/\*([^*]+)\*/sub { + my $idx = $italic_idx++; + push @italic_parts, { type => 'italic', content => $1 }; + return "\x01I$idx\x02"; + }->()/ge; + $content =~ s/_([^_]+)_/sub { + my $idx = $italic_idx++; + push @italic_parts, { type => 'italic', content => $1 }; + return "\x01I$idx\x02"; + }->()/ge; + $text =~ s/\x01B$i\x02/$content<\/strong>/; + } + + my @format_parts; + my $format_idx = 0; + $text =~ s/<(strong|em)>(.*?)<\/(strong|em)>/sub { + my $idx = $format_idx++; + push @format_parts, { tag => $1, content => $2 }; + return "\x01F$idx\x02"; + }->()/gse; + + $text = escape_html($text); + + for ( my $i = 0 ; $i < @format_parts ; $i++ ) { + my $part = $format_parts[$i]; + $text =~ +s/\x01F$i\x02/<$part->{tag}>@{[escape_html($part->{content})]}<\/$part->{tag}>/; + } + + for ( my $i = 0 ; $i < @italic_parts ; $i++ ) { + my $part = $italic_parts[$i]; + $text =~ s/\x01I$i\x02/@{[escape_html($part->{content})]}<\/em>/; + } + + for ( my $i = 0 ; $i < @placeholders ; $i++ ) { + my $part = $placeholders[$i]; + my $replacement; + if ( $part->{type} eq 'code' ) { + $replacement = + "" . escape_html( $part->{content} ) . ""; + } + elsif ( $part->{type} eq 'image' ) { + if ( is_safe_url( $part->{url} ) ) { + $replacement = + "{url} ) + . "\" alt=\"" + . escape_html( $part->{alt} ) . "\">"; + } + else { + $replacement = escape_html( $part->{alt} ); + } + } + elsif ( $part->{type} eq 'link' ) { + if ( is_safe_url( $part->{url} ) ) { + $replacement = + "{url} ) . "\">" + . escape_html( $part->{text} ) . ""; + } + else { + $replacement = escape_html( $part->{text} ); + } + } + $text =~ s/\x01$i\x02/$replacement/; + } + + return $text; +} + +sub is_safe_url { + my ($url) = @_; + return 0 if $url =~ /^\s*javascript:/i; + return 0 if $url =~ /^\s*data:/i; + return 0 if $url =~ /^\s*vbscript:/i; + return 0 if $url =~ /^\s*file:/i; + return 1; +} + +sub escape_html { + my ($text) = @_; + $text =~ s/&/&/g; + $text =~ s//>/g; + $text =~ s/"/"/g; + $text =~ s/'/'/g; + return $text; +} + +1; +