Compare commits

..

9 Commits

5 changed files with 69 additions and 47 deletions

View File

@@ -1,4 +1,4 @@
# m2h - Markdown to HTML Converter # m2h
A lightweight, pure Perl markdown to HTML converter that uses a state machine for parsing. A lightweight, pure Perl markdown to HTML converter that uses a state machine for parsing.
@@ -7,12 +7,13 @@ A lightweight, pure Perl markdown to HTML converter that uses a state machine fo
- Pure Perl implementation - no external dependencies - Pure Perl implementation - no external dependencies
- State machine-based parsing for efficient and maintainable code - State machine-based parsing for efficient and maintainable code
- Converts standard markdown syntax to HTML - Converts standard markdown syntax to HTML
- Secure HTML output, especially against XSS or file protocols - Secure HTML output, especially against XSS or file protocols
- Fast and lightweight - Fast and lightweight
## Requirements ## Requirements
- Perl 5.42 or higher - Perl 5.42 or higher
- Getopt::Long (but it's already installed on most Perl installations)
## Installation ## Installation

View File

@@ -288,8 +288,8 @@ sub parse_inline {
push @bold_parts, { type => 'bold', content => $1 }; push @bold_parts, { type => 'bold', content => $1 };
return "\x01B$idx\x02"; return "\x01B$idx\x02";
}->()/ge; }->()/ge;
$text =~ s/___((?:[^_]|_(?!_))+?)___/<strong>$1<\/strong>/g; $text =~ s/(?<!\w)___((?:[^_]|_(?!_))+?)___(?!\w)/<strong>$1<\/strong>/g;
$text =~ s/__((?:[^_]|_(?!_))+?)__/<strong>$1<\/strong>/g; $text =~ s/(?<!\w)__((?:[^_]|_(?!_))+?)__(?!\w)/<strong>$1<\/strong>/g;
my @italic_parts; my @italic_parts;
my $italic_idx = 0; my $italic_idx = 0;
@@ -298,7 +298,7 @@ sub parse_inline {
push @italic_parts, { type => 'italic', content => $1 }; push @italic_parts, { type => 'italic', content => $1 };
return "\x01I$idx\x02"; return "\x01I$idx\x02";
}->()/ge; }->()/ge;
$text =~ s/_([^_]+)_/sub { $text =~ s/(?<!\w)_((?:[^_]|_(?!_))+?)_(?!\w)/sub {
my $idx = $italic_idx++; my $idx = $italic_idx++;
push @italic_parts, { type => 'italic', content => $1 }; push @italic_parts, { type => 'italic', content => $1 };
return "\x01I$idx\x02"; return "\x01I$idx\x02";
@@ -312,7 +312,7 @@ sub parse_inline {
push @italic_parts, { type => 'italic', content => $1 }; push @italic_parts, { type => 'italic', content => $1 };
return "\x01I$idx\x02"; return "\x01I$idx\x02";
}->()/ge; }->()/ge;
$content =~ s/_([^_]+)_/sub { $content =~ s/(?<!\w)_((?:[^_]|_(?!_))+?)_(?!\w)/sub {
my $idx = $italic_idx++; my $idx = $italic_idx++;
push @italic_parts, { type => 'italic', content => $1 }; push @italic_parts, { type => 'italic', content => $1 };
return "\x01I$idx\x02"; return "\x01I$idx\x02";
@@ -379,10 +379,22 @@ s/\x01F$i\x02/<$part->{tag}>@{[escape_html($part->{content})]}<\/$part->{tag}>/;
sub is_safe_url { sub is_safe_url {
my ($url) = @_; my ($url) = @_;
return 0 if $url =~ /^\s*javascript:/i; my $normalized = $url // '';
return 0 if $url =~ /^\s*data:/i;
return 0 if $url =~ /^\s*vbscript:/i; $normalized =~ s/^\s+//;
return 0 if $url =~ /^\s*file:/i; $normalized =~ s/\s+$//;
$normalized =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg
while $normalized =~ /%[0-9A-Fa-f]{2}/;
$normalized =~ s/&#x([0-9A-Fa-f]+);?/chr(hex($1))/eg;
$normalized =~ s/&#(\d+);?/chr($1)/eg;
if ( $normalized =~ /^([a-z][a-z0-9+\-.]*):/i ) {
my $scheme = lc $1;
return 1
if $scheme eq 'http' || $scheme eq 'https' || $scheme eq 'mailto';
return 0;
}
return 1; return 1;
} }
@@ -397,4 +409,3 @@ sub escape_html {
} }
1; 1;

65
m2h.pl
View File

@@ -1,6 +1,8 @@
#!perl -w #!perl -w
use strict; use strict;
use Getopt::Long;
use MarkdownParser; use MarkdownParser;
use open qw(:std :encoding(UTF-8));
sub show_help { sub show_help {
print <<"EOF"; print <<"EOF";
@@ -23,46 +25,45 @@ sub show_version {
exit 0; exit 0;
} }
# Slurp the entire input and return it as a single string.
#
# Parameters:
#   $file - path of the file to read. When it is undefined or the empty
#           string, the input is read from STDIN instead.
# Returns:
#   The complete contents of the chosen input, read in one go.
# Dies:
#   With "Error: Cannot open file: <path>" when the file cannot be opened.
sub read_input {
    my ($file) = @_;

    # Undefine the input record separator for this scope only, so that one
    # readline call returns everything up to end-of-file.
    local $/;

    # Check "defined and non-empty" rather than plain truthiness: a file
    # literally named "0" is false in boolean context and would otherwise
    # be ignored in favour of STDIN.
    if ( defined $file && length $file ) {
        open my $fh, '<', $file
          or die "Error: Cannot open file: $file\n";

        # Decode the file's bytes as UTF-8 while reading.
        binmode $fh, ':encoding(UTF-8)';
        my $content = <$fh>;
        close $fh;
        return $content;
    }

    # No file given: read STDIN with whatever I/O layers it already has.
    return <STDIN>;
}
my $output_file; my $output_file;
my $input_file; my $help = 0;
my $version = 0;
for ( my $i = 0 ; $i < @ARGV ; $i++ ) { GetOptions(
my $arg = $ARGV[$i]; 'help|h' => \$help,
if ( $arg eq '-h' || $arg eq '--help' ) { 'version|v' => \$version,
show_help(); 'output|o=s' => \$output_file,
} ) or show_help();
elsif ( $arg eq '-v' || $arg eq '--version' ) {
show_version();
}
elsif ( $arg eq '-o' || $arg eq '--output' ) {
$output_file = $ARGV[ ++$i ]
or die "Error: -o requires a filename\n";
}
elsif ( $arg =~ /^-/ ) {
die "Error: Unknown option: $arg\n";
}
else {
$input_file = $arg;
}
}
my $input; show_help() if $help;
if ($input_file) { show_version() if $version;
open my $fh, '<', $input_file
or die "Error: Cannot open file: $input_file\n"; my $input_file = shift @ARGV;
local $/;
$input = <$fh>; binmode STDIN, ':encoding(UTF-8)';
close $fh; binmode STDOUT, ':encoding(UTF-8)';
}
else { my $input = read_input($input_file);
local $/;
$input = <STDIN>;
}
my $output; my $output;
if ($output_file) { if ($output_file) {
open $output, '>', $output_file open $output, '>', $output_file
or die "Error: Cannot write to file: $output_file\n"; or die "Error: Cannot write to file: $output_file\n";
binmode $output, ':encoding(UTF-8)';
} }
else { else {
$output = \*STDOUT; $output = \*STDOUT;

View File

@@ -2,7 +2,7 @@
use strict; use strict;
use warnings; use warnings;
use Test::More tests => 11; use Test::More tests => 13;
use MarkdownParser; use MarkdownParser;
my $parser = MarkdownParser->new(); my $parser = MarkdownParser->new();
@@ -62,4 +62,7 @@ is(
"<p><strong>bold text</strong></p>\n", "<p><strong>bold text</strong></p>\n",
"Bold with ___" "Bold with ___"
); );
is( $parser->parse("my_variable"),
"<p>my_variable</p>\n", "Underscore inside word unchanged" );
is( $parser->parse("CONST__VALUE"),
"<p>CONST__VALUE</p>\n", "Double underscores inside word unchanged" );

View File

@@ -2,7 +2,7 @@
use strict; use strict;
use warnings; use warnings;
use Test::More tests => 8; use Test::More tests => 10;
use MarkdownParser; use MarkdownParser;
my $parser = MarkdownParser->new(); my $parser = MarkdownParser->new();
@@ -37,8 +37,14 @@ is(
"<p>Click me</p>\n", "<p>Click me</p>\n",
"Data protocol blocked in links" "Data protocol blocked in links"
); );
is(
$parser->parse("[Click me](javascript&#x3A;alert('XSS'))"),
"<p>Click me</p>\n",
"Encoded JavaScript protocol blocked in links"
);
is( $parser->parse("![Image](javascript:alert('XSS'))"), is( $parser->parse("![Image](javascript:alert('XSS'))"),
"<p>Image</p>\n", "JavaScript protocol blocked in images" ); "<p>Image</p>\n", "JavaScript protocol blocked in images" );
is( $parser->parse("![Image](file:///etc/passwd)"), is( $parser->parse("![Image](file:///etc/passwd)"),
"<p>Image</p>\n", "File protocol blocked in images" ); "<p>Image</p>\n", "File protocol blocked in images" );
is( $parser->parse("![Image](javascript:%2f%2falert('XSS'))"),
"<p>Image</p>\n", "Encoded JavaScript protocol blocked in images" );