test: add underscores-in-words regression tests for emphasis

feat: harden emphasis regexes to avoid matching inside words
test: add encoded protocol XSS regression tests for links/images
2025-12-11 16:38:20 +01:00 · 2025-12-11 16:37:46 +01:00 · 2025-12-11 15:58:07 +01:00 · 2025-12-11 15:57:25 +01:00 · 2025-12-11 14:36:53 +01:00 · 2025-11-23 11:36:08 +01:00
16 changed files with 1151 additions and 0 deletions
--- a/36
+++ b/36
@@ -0,0 +1,36 @@
+# Installation layout; override PREFIX on the command line to relocate,
+# e.g. `make install PREFIX=$HOME/.local`.
+PREFIX ?= /usr/local
+BINDIR = $(PREFIX)/bin
+PERLLIB = $(PREFIX)/lib/perl5/site_perl
+SCRIPT = m2h.pl
+TARGET = m2h
+
+# Every target below is a command name, not a file to build.
+.PHONY: all install install-bin install-lib uninstall test clean tidy help
+
+# Nothing to compile: the project is plain Perl.
+all:
+
+install: install-bin install-lib
+
+# Copy the script to $(BINDIR) under its short name and make it executable.
+install-bin:
+	@mkdir -p $(BINDIR)
+	@cp $(SCRIPT) $(BINDIR)/$(TARGET)
+	@chmod +x $(BINDIR)/$(TARGET)
+
+# Copy the parser module to $(PERLLIB).
+install-lib:
+	@mkdir -p $(PERLLIB)
+	@cp lib/MarkdownParser.pm $(PERLLIB)/
+
+uninstall:
+	@rm -f $(BINDIR)/$(TARGET)
+	@rm -f $(PERLLIB)/MarkdownParser.pm
+
+# Run the test suite under t/ with lib/ on the include path.
+test:
+	@prove -lr t
+
+# Remove generated HTML and the *.bak backups left behind by `make tidy`.
+clean:
+	@rm -f *.html
+	@find . -name \*.bak -exec rm -fv {} \;
+
+# Reformat the sources in place (perltidy -b keeps a .bak copy of each file).
+tidy:
+	@perltidy -b $(SCRIPT) lib/MarkdownParser.pm t/*
+
+# List the available targets.
+help:
+	@echo "Targets:"
+	@echo "  install    install $(TARGET) and MarkdownParser.pm under $(PREFIX)"
+	@echo "  uninstall  remove the installed files"
+	@echo "  test       run the test suite"
+	@echo "  clean      remove generated *.html and *.bak files"
+	@echo "  tidy       reformat the sources with perltidy"
+
+
--- a/README.md
+++ b/README.md
@@ -0,0 +1,110 @@
+# m2h
+
+A lightweight, pure Perl markdown to HTML converter that uses a state machine for parsing.
+
+## Features
+
+- Pure Perl implementation - no external dependencies
+- State machine-based parsing for efficient and maintainable code
+- Converts standard markdown syntax to HTML
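+- Secure HTML output: text is HTML-escaped, and links and images only keep `http`, `https`, `mailto` or relative URLs, which guards against XSS and `file:`-style protocols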
+- Fast and lightweight
+
+## Requirements
+
+- Perl 5.42 or higher
+- Getopt::Long (but it's already installed on most Perl installations)
+
+## Installation
+
+No installation required. Simply download the script and make it executable:
+
+```bash
+chmod +x m2h.pl
+```
+
+## Usage
+
+### Basic Usage
+
+Convert a markdown file to HTML:
+
+```bash
+perl -Ilib m2h.pl input.md > output.html
+```
+
+Or read from stdin:
+
+```bash
+cat input.md | perl -Ilib m2h.pl > output.html
+```
+
+### Command Line Options
+
+```
+m2h [options] [file]
+
+Options:
+    -h, --help      Show this help message
+    -v, --version   Show version information
+    -o, --output    Specify output file (default: stdout)
+```
+
+Install the script using:
+
+```bash
+make install
+```
+
+Run the test suite using:
+
+```bash
+make test
+```
+
+## Supported Markdown Features
+
+- Headers (H1-H6)
+- Paragraphs
+- Bold and italic text
+- Links
+- Images
+- Lists (ordered and unordered)
+- Code blocks and inline code
+- Blockquotes
+- Horizontal rules
+- Tables
+
+## How It Works
+
+m2h uses a state machine to parse markdown text. The parser transitions between different states (e.g., paragraph, code block, list) based on the input it encounters, allowing for efficient and accurate parsing of markdown syntax.
+
+## Example
+
+### Input (markdown)
+
+```markdown
+# Hello World
+
+This is a **bold** and _italic_ example.
+
+- Item 1
+- Item 2
+- Item 3
+```
+
+### Output (HTML)
+
+```html
+<h1>Hello World</h1>
+<p>This is a <strong>bold</strong> and <em>italic</em> example.</p>
+<ul>
+  <li>Item 1</li>
+  <li>Item 2</li>
+  <li>Item 3</li>
+</ul>
+```
+
+## License
+
+MIT License, see [LICENSE](LICENSE) for details.
--- a/lib/MarkdownParser.pm
+++ b/lib/MarkdownParser.pm
@@ -0,0 +1,411 @@
+package MarkdownParser;
+
+use strict;
+use warnings;
+
+our $VERSION = '1.0';
+
+my %CLOSING_TAGS = (
+    ulist      => "</ul>",
+    olist      => "</ol>",
+    blockquote => "</blockquote>",
+    table      => "</table>",
+);
+
+# Constructor. Returns a parser object that starts in the 'paragraph'
+# state with an empty output string and an empty list of input lines.
+sub new {
+    my ($class) = @_;
+
+    my %fields = (
+        lines  => [],
+        output => '',
+        state  => 'paragraph',
+    );
+
+    return bless {%fields}, $class;
+}
+
+# Build the dispatch table used by process_line: a hash, stored in
+# $self->{handlers}, that maps a line type (as reported by
+# detect_line_type) to a callback invoked as $handler->( $self, $line_type ).
+# Line types without an entry ('paragraph') are handled by the caller.
+sub _init_handlers {
+    my $self = shift;
+    $self->{handlers} = {
+        code_block_start => sub {
+            my ( $self, $line_type ) = @_;
+            $self->transition_to_state('code_block');
+            $self->{output} .= "<pre><code>\n";
+        },
+        header => sub {
+            my ( $self, $line_type ) = @_;
+
+            # finish_state (rather than transition_to_state('paragraph'))
+            # because the latter does nothing when the parser is already
+            # in the paragraph state, which would leave buffered paragraph
+            # text to be emitted *after* this header.
+            $self->finish_state();
+            my $level = length( $line_type->{match} );
+            $self->{output} .=
+                "<h$level>"
+              . $self->parse_inline( $line_type->{text} )
+              . "</h$level>\n";
+        },
+        blockquote => sub {
+            my ( $self, $line_type ) = @_;
+            $self->handle_list_or_blockquote( 'blockquote', '<blockquote>',
+                $line_type->{text} );
+        },
+        ulist => sub {
+            my ( $self, $line_type ) = @_;
+            $self->handle_list_or_blockquote( 'ulist', '<ul>',
+                $line_type->{text} );
+        },
+        olist => sub {
+            my ( $self, $line_type ) = @_;
+            $self->handle_list_or_blockquote( 'olist', '<ol>',
+                $line_type->{text} );
+        },
+        horizontal_rule => sub {
+            my ( $self, $line_type ) = @_;
+
+            # Same reasoning as for headers: flush any pending paragraph
+            # so the rule is emitted after it, not before.
+            $self->finish_state();
+            $self->{output} .= "<hr>\n";
+        },
+        table_row => sub {
+            my ( $self, $line_type ) = @_;
+
+            # The first row of a table opens it and is rendered as the
+            # header row until a separator line is seen.
+            if ( $self->{state} ne 'table' ) {
+                $self->transition_to_state('table');
+                $self->{output} .= "<table>\n";
+                $self->{table_is_header} = 1;
+            }
+            $self->handle_table_row( $line_type->{text} );
+        },
+        table_separator => sub {
+            my ( $self, $line_type ) = @_;
+
+            # A separator outside a table produces no output.
+            if ( $self->{state} eq 'table' ) {
+                $self->{table_is_header} = 0;
+            }
+        },
+        blank => sub {
+            my ( $self, $line_type ) = @_;
+            $self->finish_state();
+        },
+    };
+}
+
+# Convert a markdown document to HTML.
+#
+#   $text  - the whole document as one string; lines may end in "\n" or
+#            "\r\n". An undefined value is treated as an empty document.
+#
+# Returns the generated HTML as a string. All per-document state on the
+# object is reset first, so one parser can be reused for several inputs.
+sub parse {
+    my ( $self, $text ) = @_;
+    $text //= '';
+
+    $self->{output} = '';
+    $self->{lines}  = [ split /\r?\n/, $text ];
+    $self->{state}  = 'paragraph';
+    delete $self->{paragraph_buffer};
+    $self->_init_handlers();
+
+    for my $line ( @{ $self->{lines} } ) {
+        $self->process_line($line);
+    }
+
+    # A fenced code block that is still open at end of input never saw
+    # its closing fence; close the elements here so the HTML stays
+    # well-formed.
+    if ( $self->{state} eq 'code_block' ) {
+        $self->{output} .= "</code></pre>\n";
+    }
+
+    # Flush whatever block is still open (pending paragraph, list, ...).
+    $self->finish_state();
+    return $self->{output};
+}
+
+# Route one input line. While inside a code block or a table the line goes
+# to that state's own handler; otherwise it is classified with
+# detect_line_type and passed to the matching entry of $self->{handlers},
+# falling back to handle_paragraph_line when no handler is registered.
+sub process_line {
+    my ( $self, $line ) = @_;
+
+    my %method_for_state = (
+        code_block => 'handle_code_block_line',
+        table      => 'handle_table_line',
+    );
+
+    if ( my $method = $method_for_state{ $self->{state} } ) {
+        $self->$method($line);
+        return;
+    }
+
+    my $detected = $self->detect_line_type($line);
+    my $callback = $self->{handlers}->{ $detected->{type} };
+
+    return $callback->( $self, $detected ) if $callback;
+    return $self->handle_paragraph_line($line);
+}
+
+# Classify a single line of markdown.
+#
+# Returns a hash reference with a 'type' key, one of: code_block_start,
+# header, blockquote, ulist, olist, horizontal_rule, blank,
+# table_separator, table_row or paragraph. Depending on the type it also
+# carries 'text' (the line content without its marker; for table rows the
+# whole line) and, for headers, 'match' (the run of '#' characters).
+# The checks run in the order below, so the first match wins.
+sub detect_line_type {
+    my ( $self, $line ) = @_;
+
+    return { type => 'code_block_start' } if $line =~ /^```/;
+
+    if ( $line =~ /^(#{1,6})\s+(.+)/ ) {
+        return { type => 'header', match => $1, text => $2 };
+    }
+
+    if ( $line =~ /^>\s+(.+)/ ) {
+        return { type => 'blockquote', text => $1 };
+    }
+
+    if ( $line =~ /^[-*+]\s+(.+)/ ) {
+        return { type => 'ulist', text => $1 };
+    }
+
+    if ( $line =~ /^\d+\.\s+(.+)/ ) {
+        return { type => 'olist', text => $1 };
+    }
+
+    return { type => 'horizontal_rule' } if $line =~ /^[-*_]{3,}$/;
+    return { type => 'blank' }           if $line =~ /^\s*$/;
+
+    if ( $line =~ /^\|.+\|/ ) {
+
+        # A separator line consists solely of dash cells such as
+        # "---", ":--" or ":-:". Every cell is checked, not just the
+        # first one, so a data row whose first cell is empty or "-" is
+        # not mistaken for a separator and dropped. The last cell may
+        # omit its closing pipe.
+        if ( $line =~ /^\|(?:\s*:?-+:?\s*\|)+(?:\s*:?-+:?)?\s*$/ ) {
+            return { type => 'table_separator' };
+        }
+        return { type => 'table_row', text => $line };
+    }
+
+    return { type => 'paragraph' };
+}
+
+# Switch the parser to $new_state. When the state actually changes, the
+# current block is closed via finish_state first; asking for the state the
+# parser is already in does nothing.
+sub transition_to_state {
+    my ( $self, $new_state ) = @_;
+
+    return if $self->{state} eq $new_state;
+
+    $self->finish_state();
+    $self->{state} = $new_state;
+}
+
+# Handle one line while inside a fenced code block: a line starting with
+# ``` closes the block and returns the parser to the paragraph state; any
+# other line is appended to the output HTML-escaped, followed by a newline.
+sub handle_code_block_line {
+    my ( $self, $line ) = @_;
+
+    if ( $line !~ /^```/ ) {
+        $self->{output} .= escape_html($line) . "\n";
+        return;
+    }
+
+    $self->{output} .= "</code></pre>\n";
+    $self->transition_to_state('paragraph');
+}
+
+# Emit one entry of a list or blockquote.
+#
+#   $target_state - 'ulist', 'olist' or 'blockquote'
+#   $open_tag     - container tag written when the block starts
+#   $text         - entry text, rendered through parse_inline
+#
+# The container is opened only when the parser is not already in
+# $target_state. Blockquote entries are wrapped in <p>, all others in <li>.
+sub handle_list_or_blockquote {
+    my ( $self, $target_state, $open_tag, $text ) = @_;
+
+    unless ( $self->{state} eq $target_state ) {
+        $self->transition_to_state($target_state);
+        $self->{output} .= "$open_tag\n";
+    }
+
+    my $item_tag = 'li';
+    $item_tag = 'p' if $target_state eq 'blockquote';
+
+    my $html = $self->parse_inline($text);
+    $self->{output} .= "<$item_tag>$html</$item_tag>\n";
+}
+
+# Append a plain text line to the pending paragraph. Consecutive lines are
+# joined with a single space; the paragraph itself is emitted later by
+# finish_state. Any other open block is closed first.
+sub handle_paragraph_line {
+    my ( $self, $line ) = @_;
+
+    if ( $self->{state} ne 'paragraph' ) {
+        $self->transition_to_state('paragraph');
+    }
+
+    my $buffer = $self->{paragraph_buffer} // '';
+
+    # Test the length, not the truthiness: a buffer holding just "0" is
+    # false in Perl but still needs the separating space.
+    $self->{paragraph_buffer} =
+      length $buffer ? "$buffer $line" : $line;
+}
+
+# Handle one line while inside a table. A separator line ends the header
+# part of the table, a row line is rendered, and anything else closes the
+# table and is then processed again as an ordinary line.
+sub handle_table_line {
+    my ( $self, $line ) = @_;
+
+    my $detected = $self->detect_line_type($line);
+    my $kind     = $detected->{type};
+
+    if ( $kind eq 'table_row' ) {
+        $self->handle_table_row( $detected->{text} );
+    }
+    elsif ( $kind eq 'table_separator' ) {
+        $self->{table_is_header} = 0;
+    }
+    else {
+        $self->finish_state();
+        $self->process_line($line);
+    }
+
+    return;
+}
+
+# Render one table row as a <tr> element.
+#
+#   $row - the raw line, e.g. "| a | b |"
+#
+# The outer pipes are removed, the rest is split on '|' and each cell is
+# trimmed and rendered through parse_inline. Cells become <th> while
+# $self->{table_is_header} is true and <td> afterwards.
+sub handle_table_row {
+    my ( $self, $row ) = @_;
+
+    $row =~ s/^\|\s*//;
+
+    # Also swallow whitespace after the closing pipe, which would
+    # otherwise be split off as a spurious extra cell.
+    $row =~ s/\s*\|\s*$//;
+
+    # Limit -1 keeps trailing empty fields, so "| a |  |" still yields
+    # two cells instead of losing the empty one.
+    my @cells = split /\|/, $row, -1;
+    for my $cell (@cells) {
+        $cell =~ s/^\s+//;
+        $cell =~ s/\s+$//;
+    }
+
+    my $tag = $self->{table_is_header} ? 'th' : 'td';
+
+    $self->{output} .= "<tr>\n";
+    for my $cell (@cells) {
+        $self->{output} .= "<$tag>" . $self->parse_inline($cell) . "</$tag>\n";
+    }
+    $self->{output} .= "</tr>\n";
+}
+
+# Close whatever block is currently open and return to the paragraph
+# state. A pending paragraph containing non-whitespace text is emitted as
+# <p>...</p> and its buffer discarded; for states listed in %CLOSING_TAGS
+# the matching closing tag is written instead.
+sub finish_state {
+    my $self  = shift;
+    my $state = $self->{state};
+
+    my $has_paragraph_text =
+         $state eq 'paragraph'
+      && exists $self->{paragraph_buffer}
+      && $self->{paragraph_buffer} =~ /\S/;
+
+    if ($has_paragraph_text) {
+        my $html = $self->parse_inline( $self->{paragraph_buffer} );
+        $self->{output} .= "<p>$html</p>\n";
+        delete $self->{paragraph_buffer};
+    }
+    elsif ( exists $CLOSING_TAGS{$state} ) {
+        $self->{output} .= "$CLOSING_TAGS{$state}\n";
+    }
+
+    $self->{state} = 'paragraph';
+}
+
+# Convert the inline markdown in $text (code spans, images, links, bold and
+# italic) to HTML and return the result.
+#
+# The work is done in passes. Each recognised construct is cut out of the
+# text and replaced by a placeholder delimited by \x01 and \x02, the
+# remaining text is HTML-escaped, and finally the placeholders are
+# replaced by generated markup whose contents are escaped individually.
+# Links and images whose URL is rejected by is_safe_url are reduced to
+# their escaped link text / alt text.
+sub parse_inline {
+    my ( $self, $text ) = @_;
+
+    my @placeholders;
+    my $placeholder_idx = 0;
+
+    # Pass 1: code spans, images and links become "\x01<n>\x02". Code
+    # spans go first, so markup inside backticks is not interpreted.
+    $text =~ s/`([^`]+)`/sub {
+        my $idx = $placeholder_idx++;
+        push @placeholders, { type => 'code', content => $1 };
+        return "\x01$idx\x02";
+    }->()/ge;
+
+    # The URL part may contain one level of balanced parentheses.
+    $text =~ s/!\[([^\]]*)\]\(((?:[^()]|\([^()]*\))+)\)/sub {
+        my $idx = $placeholder_idx++;
+        push @placeholders, { type => 'image', alt => $1, url => $2 };
+        return "\x01$idx\x02";
+    }->()/ge;
+
+    $text =~ s/\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)/sub {
+        my $idx = $placeholder_idx++;
+        push @placeholders, { type => 'link', text => $1, url => $2 };
+        return "\x01$idx\x02";
+    }->()/ge;
+
+    # Pass 2: asterisk bold (*** and **) becomes "\x01B<n>\x02"; both
+    # forms are recorded as plain bold.
+    my @bold_parts;
+    my $bold_idx = 0;
+    $text =~ s/\*\*\*((?:[^*]|\*(?!\*))+)\*\*\*/sub {
+        my $idx = $bold_idx++;
+        push @bold_parts, { type => 'bold', content => $1 };
+        return "\x01B$idx\x02";
+    }->()/ge;
+    $text =~ s/\*\*((?:[^*]|\*(?!\*))+)\*\*/sub {
+        my $idx = $bold_idx++;
+        push @bold_parts, { type => 'bold', content => $1 };
+        return "\x01B$idx\x02";
+    }->()/ge;
+
+    # Underscore bold is rewritten straight to <strong> tags. The \w
+    # lookarounds stop it from matching inside words (CONST__VALUE).
+    $text =~ s/(?<!\w)___((?:[^_]|_(?!_))+?)___(?!\w)/<strong>$1<\/strong>/g;
+    $text =~ s/(?<!\w)__((?:[^_]|_(?!_))+?)__(?!\w)/<strong>$1<\/strong>/g;
+
+    # Pass 3: italics outside asterisk-bold spans become "\x01I<n>\x02".
+    my @italic_parts;
+    my $italic_idx = 0;
+    $text =~ s/\*([^*]+)\*/sub {
+        my $idx = $italic_idx++;
+        push @italic_parts, { type => 'italic', content => $1 };
+        return "\x01I$idx\x02";
+    }->()/ge;
+    $text =~ s/(?<!\w)_((?:[^_]|_(?!_))+?)_(?!\w)/sub {
+        my $idx = $italic_idx++;
+        push @italic_parts, { type => 'italic', content => $1 };
+        return "\x01I$idx\x02";
+    }->()/ge;
+
+    # Pass 4: resolve italics nested inside each asterisk-bold span, then
+    # put the span back into the text as a literal <strong> element.
+    for ( my $i = 0 ; $i < @bold_parts ; $i++ ) {
+        my $part    = $bold_parts[$i];
+        my $content = $part->{content};
+        $content =~ s/\*([^*]+)\*/sub {
+            my $idx = $italic_idx++;
+            push @italic_parts, { type => 'italic', content => $1 };
+            return "\x01I$idx\x02";
+        }->()/ge;
+        $content =~ s/(?<!\w)_((?:[^_]|_(?!_))+?)_(?!\w)/sub {
+            my $idx = $italic_idx++;
+            push @italic_parts, { type => 'italic', content => $1 };
+            return "\x01I$idx\x02";
+        }->()/ge;
+        $text =~ s/\x01B$i\x02/<strong>$content<\/strong>/;
+    }
+
+    # Pass 5: lift every <strong>/<em> element out as "\x01F<n>\x02" so the
+    # escape_html call below leaves its tags alone. The pattern also
+    # matches such tags typed literally in the input, and it does not
+    # require the closing tag name to equal the opening one.
+    my @format_parts;
+    my $format_idx = 0;
+    $text =~ s/<(strong|em)>(.*?)<\/(strong|em)>/sub {
+        my $idx = $format_idx++;
+        push @format_parts, { tag => $1, content => $2 };
+        return "\x01F$idx\x02";
+    }->()/gse;
+
+    # Everything still in $text is plain text plus placeholders.
+    $text = escape_html($text);
+
+    # Pass 6: restore formatting elements. The opening tag name is used
+    # for both tags and the content is escaped.
+    for ( my $i = 0 ; $i < @format_parts ; $i++ ) {
+        my $part = $format_parts[$i];
+        $text =~
+s/\x01F$i\x02/<$part->{tag}>@{[escape_html($part->{content})]}<\/$part->{tag}>/;
+    }
+
+    # Pass 7: restore italics as <em> with escaped content.
+    for ( my $i = 0 ; $i < @italic_parts ; $i++ ) {
+        my $part = $italic_parts[$i];
+        $text =~ s/\x01I$i\x02/<em>@{[escape_html($part->{content})]}<\/em>/;
+    }
+
+    # Pass 8: restore code spans, images and links. This runs last, so
+    # placeholders that ended up inside bold or italic content are
+    # resolved as well.
+    for ( my $i = 0 ; $i < @placeholders ; $i++ ) {
+        my $part = $placeholders[$i];
+        my $replacement;
+        if ( $part->{type} eq 'code' ) {
+            $replacement =
+              "<code>" . escape_html( $part->{content} ) . "</code>";
+        }
+        elsif ( $part->{type} eq 'image' ) {
+            if ( is_safe_url( $part->{url} ) ) {
+                $replacement =
+                    "<img src=\""
+                  . escape_html( $part->{url} )
+                  . "\" alt=\""
+                  . escape_html( $part->{alt} ) . "\">";
+            }
+            else {
+                # Unsafe URL: keep only the alt text.
+                $replacement = escape_html( $part->{alt} );
+            }
+        }
+        elsif ( $part->{type} eq 'link' ) {
+            if ( is_safe_url( $part->{url} ) ) {
+                $replacement =
+                    "<a href=\""
+                  . escape_html( $part->{url} ) . "\">"
+                  . escape_html( $part->{text} ) . "</a>";
+            }
+            else {
+                # Unsafe URL: keep only the link text.
+                $replacement = escape_html( $part->{text} );
+            }
+        }
+        $text =~ s/\x01$i\x02/$replacement/;
+    }
+
+    return $text;
+}
+
+# Decide whether $url may be emitted in an href/src attribute.
+#
+# Returns 1 for URLs without a scheme (relative URLs) and for the schemes
+# http, https and mailto; returns 0 for any other scheme (javascript:,
+# data:, file:, ...). An undefined URL is treated as an empty string.
+#
+# The scheme check runs on a normalised copy: percent-escapes and numeric
+# character references are decoded, and the characters a browser ignores
+# while parsing a URL are removed, so that obfuscated forms such as
+# "javascript&#x3A;..." or "java<TAB>script:..." are still recognised.
+sub is_safe_url {
+    my ($url) = @_;
+    my $normalized = $url // '';
+
+    $normalized =~ s/^\s+//;
+    $normalized =~ s/\s+$//;
+
+    # Decode repeatedly so that nested encodings (%2541 -> %41 -> A) are
+    # unwrapped as well; every pass shortens the string, so this ends.
+    $normalized =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg
+      while $normalized =~ /%[0-9A-Fa-f]{2}/;
+
+    # Numeric character references, with "x" or "X" and optional leading
+    # zeros. The digit count is bounded so absurdly long numbers cannot
+    # overflow hex()/chr().
+    $normalized =~ s/&#[xX]0*([0-9A-Fa-f]{1,6});?/chr(hex($1))/eg;
+    $normalized =~ s/&#0*(\d{1,7});?/chr($1)/eg;
+
+    # Browsers drop ASCII tab/newline anywhere in a URL and strip leading
+    # and trailing C0 control characters and spaces before looking at the
+    # scheme; do the same, or "java\tscript:" would pass as relative.
+    $normalized =~ tr/\t\n\r//d;
+    $normalized =~ s/^[\x00-\x20]+//;
+    $normalized =~ s/[\x00-\x20]+$//;
+
+    if ( $normalized =~ /^([a-z][a-z0-9+\-.]*):/i ) {
+        my $scheme = lc $1;
+        return 1
+          if $scheme eq 'http' || $scheme eq 'https' || $scheme eq 'mailto';
+        return 0;
+    }
+
+    return 1;
+}
+
+# Return a copy of $text with the five HTML-special characters
+# & < > " ' replaced by their entities, so the text is safe both as
+# element content and inside a quoted attribute value.
+sub escape_html {
+    my ($text) = @_;
+
+    my %entity_for = (
+        '&' => '&amp;',
+        '<' => '&lt;',
+        '>' => '&gt;',
+        '"' => '&quot;',
+        "'" => '&#39;',
+    );
+
+    # One pass over the input, so the '&' of a generated entity is never
+    # escaped a second time.
+    $text =~ s/([&<>"'])/$entity_for{$1}/g;
+    return $text;
+}
+
+1;
--- a/m2h.pl
+++ b/m2h.pl
@@ -0,0 +1,75 @@
+#!perl -w
+use strict;
+use Getopt::Long;
+use MarkdownParser;
+use open qw(:std :encoding(UTF-8));
+
+# Print the usage text to standard output and exit with status 0.
+sub show_help {
+    my $usage = <<"EOF";
+m2h - Markdown to HTML Converter
+
+Usage: $0 [options] [file]
+
+Options:
+    -h, --help      Show this help message
+    -v, --version   Show version information
+    -o, --output    Specify output file (default: stdout)
+
+If no file is specified, input is read from stdin.
+EOF
+
+    print $usage;
+    exit 0;
+}
+
+# Print the program version (taken from MarkdownParser's $VERSION) to
+# standard output and exit with status 0.
+sub show_version {
+    printf "m2h version %s\n", $MarkdownParser::VERSION;
+    exit 0;
+}
+
+# Read the whole markdown input and return it as one string.
+#
+#   $file - path of the file to read; when undefined or empty, the input
+#           is read from STDIN instead.
+#
+# Dies with an error message when the file cannot be opened.
+sub read_input {
+    my ($file) = @_;
+    local $/;    # slurp mode: one read returns everything
+
+    # Test definedness/length rather than truthiness, so that a file
+    # literally named "0" is opened and not mistaken for "no file".
+    if ( defined $file && length $file ) {
+
+        # Name the layer in the open mode: an explicit layer replaces the
+        # default set by the script's `use open` pragma, so the handle
+        # gets exactly one UTF-8 decoding layer. A separate binmode call
+        # would push a second one on top of that default.
+        open my $fh, '<:encoding(UTF-8)', $file
+          or die "Error: Cannot open file: $file\n";
+        my $content = <$fh>;
+        close $fh;
+        return $content;
+    }
+    return <STDIN>;
+}
+
+# ---- Main program: parse options, read the input, write the HTML ----
+
+my $output_file;
+my $help    = 0;
+my $version = 0;
+
+GetOptions(
+    'help|h'     => \$help,
+    'version|v'  => \$version,
+    'output|o=s' => \$output_file,
+) or show_help();
+
+show_help()    if $help;
+show_version() if $version;
+
+# First remaining argument is the input file; undef means "read STDIN".
+my $input_file = shift @ARGV;
+
+# No binmode on STDIN/STDOUT here: the `use open qw(:std ...)` pragma at
+# the top of this script already gives them a UTF-8 layer, and binmode
+# would stack a second encoding layer on top of it.
+my $input = read_input($input_file);
+
+my $write_to_file = defined $output_file && length $output_file;
+
+my $output;
+if ($write_to_file) {
+
+    # An explicit layer in the open mode replaces the pragma's default,
+    # so the handle gets exactly one UTF-8 encoding layer.
+    open $output, '>:encoding(UTF-8)', $output_file
+      or die "Error: Cannot write to file: $output_file\n";
+}
+else {
+    $output = \*STDOUT;
+}
+
+my $parser = MarkdownParser->new();
+print {$output} $parser->parse($input);
+
+# Buffered write errors (disk full, ...) only surface at close time.
+if ($write_to_file) {
+    close $output
+      or die "Error: Cannot write to file: $output_file\n";
+}
--- a/t/00-load.t
+++ b/t/00-load.t
@@ -0,0 +1,9 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 1;
+use MarkdownParser;
+
+ok( 1, 'Module loaded successfully' );
+
--- a/t/01-headers.t
+++ b/t/01-headers.t
@@ -0,0 +1,16 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 6;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is( $parser->parse("# Header 1"),      "<h1>Header 1</h1>\n", "H1 header" );
+is( $parser->parse("## Header 2"),     "<h2>Header 2</h2>\n", "H2 header" );
+is( $parser->parse("### Header 3"),    "<h3>Header 3</h3>\n", "H3 header" );
+is( $parser->parse("#### Header 4"),   "<h4>Header 4</h4>\n", "H4 header" );
+is( $parser->parse("##### Header 5"),  "<h5>Header 5</h5>\n", "H5 header" );
+is( $parser->parse("###### Header 6"), "<h6>Header 6</h6>\n", "H6 header" );
+
--- a/t/02-paragraphs.t
+++ b/t/02-paragraphs.t
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 3;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("Simple paragraph"),
+    "<p>Simple paragraph</p>\n",
+    "Single paragraph"
+);
+is(
+    $parser->parse("First paragraph\n\nSecond paragraph"),
+    "<p>First paragraph</p>\n<p>Second paragraph</p>\n",
+    "Multiple paragraphs"
+);
+is(
+    $parser->parse("Paragraph with\nmultiple lines"),
+    "<p>Paragraph with multiple lines</p>\n",
+    "Multi-line paragraph"
+);
+
--- a/t/03-formatting.t
+++ b/t/03-formatting.t
@@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 13;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("**bold text**"),
+    "<p><strong>bold text</strong></p>\n",
+    "Bold with **"
+);
+is(
+    $parser->parse("__bold text__"),
+    "<p><strong>bold text</strong></p>\n",
+    "Bold with __"
+);
+is(
+    $parser->parse("*italic text*"),
+    "<p><em>italic text</em></p>\n",
+    "Italic with *"
+);
+is(
+    $parser->parse("_italic text_"),
+    "<p><em>italic text</em></p>\n",
+    "Italic with _"
+);
+is(
+    $parser->parse("**bold** and *italic*"),
+    "<p><strong>bold</strong> and <em>italic</em></p>\n",
+    "Bold and italic together"
+);
+is(
+    $parser->parse("Text with **bold** in middle"),
+    "<p>Text with <strong>bold</strong> in middle</p>\n",
+    "Bold in middle of text"
+);
+is(
+    $parser->parse("Text with *italic* in middle"),
+    "<p>Text with <em>italic</em> in middle</p>\n",
+    "Italic in middle of text"
+);
+is(
+    $parser->parse("**bold** *italic* **bold again**"),
+"<p><strong>bold</strong> <em>italic</em> <strong>bold again</strong></p>\n",
+    "Multiple formatting"
+);
+is(
+    $parser->parse("***bold text***"),
+    "<p><strong>bold text</strong></p>\n",
+    "Bold with ***"
+);
+is(
+    $parser->parse("**bold *italic* bold**"),
+    "<p><strong>bold <em>italic</em> bold</strong></p>\n",
+    "Nested formatting"
+);
+is(
+    $parser->parse("___bold text___"),
+    "<p><strong>bold text</strong></p>\n",
+    "Bold with ___"
+);
+is( $parser->parse("my_variable"),
+    "<p>my_variable</p>\n", "Underscore inside word unchanged" );
+is( $parser->parse("CONST__VALUE"),
+    "<p>CONST__VALUE</p>\n", "Double underscores inside word unchanged" );
--- a/t/04-links-images.t
+++ b/t/04-links-images.t
@@ -0,0 +1,50 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 10;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("[link text](http://example.com)"),
+    "<p><a href=\"http://example.com\">link text</a></p>\n",
+    "Simple link"
+);
+is(
+    $parser->parse("[link with spaces](https://example.com/path)"),
+    "<p><a href=\"https://example.com/path\">link with spaces</a></p>\n",
+    "Link with path"
+);
+is(
+    $parser->parse("![alt text](image.png)"),
+    "<p><img src=\"image.png\" alt=\"alt text\"></p>\n",
+    "Simple image"
+);
+is(
+    $parser->parse("![alt with spaces](http://example.com/image.jpg)"),
+"<p><img src=\"http://example.com/image.jpg\" alt=\"alt with spaces\"></p>\n",
+    "Image with URL"
+);
+is(
+    $parser->parse("[Click me](javascript:alert('XSS'))"),
+    "<p>Click me</p>\n",
+    "JavaScript protocol blocked in links"
+);
+is(
+    $parser->parse("[Click me](data:text/html,<script>alert('XSS')</script>)"),
+    "<p>Click me</p>\n",
+    "Data protocol blocked in links"
+);
+is(
+    $parser->parse("[Click me](javascript&#x3A;alert('XSS'))"),
+    "<p>Click me</p>\n",
+    "Encoded JavaScript protocol blocked in links"
+);
+is( $parser->parse("![Image](javascript:alert('XSS'))"),
+    "<p>Image</p>\n", "JavaScript protocol blocked in images" );
+is( $parser->parse("![Image](file:///etc/passwd)"),
+    "<p>Image</p>\n", "File protocol blocked in images" );
+is( $parser->parse("![Image](javascript:%2f%2falert('XSS'))"),
+    "<p>Image</p>\n", "Encoded JavaScript protocol blocked in images" );
--- a/t/05-lists.t
+++ b/t/05-lists.t
@@ -0,0 +1,40 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 6;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("- Item 1\n- Item 2\n- Item 3"),
+    "<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n<li>Item 3</li>\n</ul>\n",
+    "Unordered list with -"
+);
+is(
+    $parser->parse("* Item 1\n* Item 2"),
+    "<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>\n",
+    "Unordered list with *"
+);
+is(
+    $parser->parse("+ Item 1\n+ Item 2"),
+    "<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>\n",
+    "Unordered list with +"
+);
+is(
+    $parser->parse("1. First item\n2. Second item\n3. Third item"),
+"<ol>\n<li>First item</li>\n<li>Second item</li>\n<li>Third item</li>\n</ol>\n",
+    "Ordered list"
+);
+is(
+    $parser->parse("- Item 1\n\n- Item 2"),
+    "<ul>\n<li>Item 1</li>\n</ul>\n<ul>\n<li>Item 2</li>\n</ul>\n",
+    "Multiple list blocks"
+);
+is(
+    $parser->parse("- **Bold item**\n- *Italic item*"),
+"<ul>\n<li><strong>Bold item</strong></li>\n<li><em>Italic item</em></li>\n</ul>\n",
+    "List items with formatting"
+);
+
--- a/t/06-code.t
+++ b/t/06-code.t
@@ -0,0 +1,35 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 5;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("`inline code`"),
+    "<p><code>inline code</code></p>\n",
+    "Inline code"
+);
+is(
+    $parser->parse("Text with `code` in it"),
+    "<p>Text with <code>code</code> in it</p>\n",
+    "Inline code in text"
+);
+is(
+    $parser->parse("```\ncode block\n```"),
+    "<pre><code>\ncode block\n</code></pre>\n",
+    "Code block"
+);
+is(
+    $parser->parse("```\nline 1\nline 2\nline 3\n```"),
+    "<pre><code>\nline 1\nline 2\nline 3\n</code></pre>\n",
+    "Multi-line code block"
+);
+is(
+    $parser->parse("```\n```"),
+    "<pre><code>\n</code></pre>\n",
+    "Empty code block"
+);
+
--- a/t/07-blockquotes.t
+++ b/t/07-blockquotes.t
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 3;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("> Quote text"),
+    "<blockquote>\n<p>Quote text</p>\n</blockquote>\n",
+    "Simple blockquote"
+);
+is(
+    $parser->parse("> First line\n> Second line"),
+    "<blockquote>\n<p>First line</p>\n<p>Second line</p>\n</blockquote>\n",
+    "Multi-line blockquote"
+);
+is(
+    $parser->parse("> Quote with **bold**"),
+    "<blockquote>\n<p>Quote with <strong>bold</strong></p>\n</blockquote>\n",
+    "Blockquote with formatting"
+);
+
--- a/t/08-horizontal-rules.t
+++ b/t/08-horizontal-rules.t
@@ -0,0 +1,13 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 3;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is( $parser->parse("---"), "<hr>\n", "Horizontal rule with ---" );
+is( $parser->parse("***"), "<hr>\n", "Horizontal rule with ***" );
+is( $parser->parse("___"), "<hr>\n", "Horizontal rule with ___" );
+
--- a/t/09-complex.t
+++ b/t/09-complex.t
@@ -0,0 +1,97 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 4;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+my $input = <<'EOF';
+# Title
+
+This is a paragraph with **bold** and *italic* text.
+
+- List item 1
+- List item 2
+
+[Link](http://example.com)
+EOF
+
+my $expected = <<'EOF';
+<h1>Title</h1>
+<p>This is a paragraph with <strong>bold</strong> and <em>italic</em> text.</p>
+<ul>
+<li>List item 1</li>
+<li>List item 2</li>
+</ul>
+<p><a href="http://example.com">Link</a></p>
+EOF
+
+is( $parser->parse($input), $expected, "Complex document" );
+
+$input = <<'EOF';
+## Section
+
+Paragraph one.
+
+> Blockquote here
+
+Another paragraph.
+EOF
+
+$expected = <<'EOF';
+<h2>Section</h2>
+<p>Paragraph one.</p>
+<blockquote>
+<p>Blockquote here</p>
+</blockquote>
+<p>Another paragraph.</p>
+EOF
+
+is( $parser->parse($input), $expected, "Document with blockquote" );
+
+$input = <<'EOF';
+# Code Example
+
+Here is some `inline code` and a code block:
+
+```
+function test() {
+    return true;
+}
+```
+EOF
+
+$expected = <<'EOF';
+<h1>Code Example</h1>
+<p>Here is some <code>inline code</code> and a code block:</p>
+<pre><code>
+function test() {
+    return true;
+}
+</code></pre>
+EOF
+
+is( $parser->parse($input), $expected, "Document with code" );
+
+$input = <<'EOF';
+# Header
+
+Paragraph with [link](url) and ![image](img.png).
+
+---
+
+## Another Header
+EOF
+
+$expected = <<'EOF';
+<h1>Header</h1>
+<p>Paragraph with <a href="url">link</a> and <img src="img.png" alt="image">.</p>
+<hr>
+<h2>Another Header</h2>
+EOF
+
+is( $parser->parse($input),
+    $expected, "Document with links, images, and horizontal rule" );
+
--- a/t/10-html-escape.t
+++ b/t/10-html-escape.t
@@ -0,0 +1,30 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 4;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+is(
+    $parser->parse("Text with <tag>"),
+    "<p>Text with &lt;tag&gt;</p>\n",
+    "HTML tags escaped"
+);
+is(
+    $parser->parse("Text with & symbol"),
+    "<p>Text with &amp; symbol</p>\n",
+    "Ampersand escaped"
+);
+is(
+    $parser->parse('Text with "quotes"'),
+    "<p>Text with &quot;quotes&quot;</p>\n",
+    "Quotes escaped"
+);
+is(
+    $parser->parse("Text with 'apostrophe'"),
+    "<p>Text with &#39;apostrophe&#39;</p>\n",
+    "Apostrophe escaped"
+);
+
--- a/t/11-tables.t
+++ b/t/11-tables.t
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use Test::More tests => 4;
+use MarkdownParser;
+
+my $parser = MarkdownParser->new();
+
+my $input = <<'EOF';
+| Header 1 | Header 2 |
+|----------|----------|
+| Cell 1   | Cell 2   |
+| Cell 3   | Cell 4   |
+EOF
+
+my $expected = <<'EOF';
+<table>
+<tr>
+<th>Header 1</th>
+<th>Header 2</th>
+</tr>
+<tr>
+<td>Cell 1</td>
+<td>Cell 2</td>
+</tr>
+<tr>
+<td>Cell 3</td>
+<td>Cell 4</td>
+</tr>
+</table>
+EOF
+
+is( $parser->parse($input), $expected, "Basic table" );
+
+$input = <<'EOF';
+| Name | Age |
+|------|-----|
+| John | 25  |
+| Jane | 30  |
+EOF
+
+$expected = <<'EOF';
+<table>
+<tr>
+<th>Name</th>
+<th>Age</th>
+</tr>
+<tr>
+<td>John</td>
+<td>25</td>
+</tr>
+<tr>
+<td>Jane</td>
+<td>30</td>
+</tr>
+</table>
+EOF
+
+is( $parser->parse($input), $expected, "Table with different content" );
+
+$input = <<'EOF';
+# Title
+
+| Col1 | Col2 |
+|------|------|
+| Data | Info |
+
+More text.
+EOF
+
+$expected = <<'EOF';
+<h1>Title</h1>
+<table>
+<tr>
+<th>Col1</th>
+<th>Col2</th>
+</tr>
+<tr>
+<td>Data</td>
+<td>Info</td>
+</tr>
+</table>
+<p>More text.</p>
+EOF
+
+is( $parser->parse($input), $expected, "Table with surrounding content" );
+
+$input = <<'EOF';
+| **Bold** | *Italic* | [Link](url) |
+|----------|----------|-------------|
+| Text     | More     | Info        |
+EOF
+
+$expected = <<'EOF';
+<table>
+<tr>
+<th><strong>Bold</strong></th>
+<th><em>Italic</em></th>
+<th><a href="url">Link</a></th>
+</tr>
+<tr>
+<td>Text</td>
+<td>More</td>
+<td>Info</td>
+</tr>
+</table>
+EOF
+
+is( $parser->parse($input), $expected, "Table with inline formatting" );
+
Author	SHA1	Message	Date
Kharec	da24af38e3	test: add underscores-in-words regression tests for emphasis	2025-12-11 16:38:20 +01:00
Kharec	6670e81640	feat: harden emphasis regexes to avoid matching inside words	2025-12-11 16:37:46 +01:00
Kharec	3459e91645	test: add encoded protocol XSS regression tests for links/images	2025-12-11 15:58:07 +01:00
Kharec	9bd98b4fb9	feat: improve url sanitization	2025-12-11 15:57:25 +01:00
Kharec	cf77dd5cf2	feat: add explicit UTF-8 handling	2025-12-11 14:36:53 +01:00
Kharec	cee5bc89fa	docs: update requirements in readme	2025-11-23 11:36:08 +01:00
Kharec	c43f718495	feat: use Getopt::Long to handle command line options	2025-11-23 11:34:56 +01:00
Kharec	c80e7d8e6b	refactor: extract file-reading logic into a helper function	2025-11-18 07:35:39 +01:00
Kharec	caaecb1661	docs: update readme	2025-11-12 21:02:39 +01:00
Kharec	2a7daa13fd	feat: main module	2025-11-12 19:28:52 +01:00
Kharec	6e9ce92cde	feat: add unit tests	2025-11-12 19:28:48 +01:00
Kharec	0eca6d6725	feat: main script	2025-11-12 19:27:59 +01:00
Kharec	9c970a84bf	build: add makefile	2025-11-12 19:27:55 +01:00
Kharec	bc3f28487c	docs: add readme	2025-11-12 19:27:49 +01:00