s around # "paragraphs" that are wrapped in non-block-level tags, such as anchors, # phrase emphasis, and spans. The list of tags we're looking for is # hard-coded: my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/; my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/; # First, look for nested blocks, e.g.: #
tags around block-level tags.
$text = _HashHTMLBlocks($text);
$text = _FormParagraphs($text);
return $text;
}
sub _RunSpanGamut {
#
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
#
my $text = shift || return '';
$text = _DoCodeSpans($text);
# Fix unencoded ampersands and <'s:
$text = _EncodeAmpsAndAngles($text);
# Process anchor and image tags. Images must come first,
# because ![foo][f] looks like an anchor.
$text = _DoImages($text);
$text = _DoAnchors($text);
$text = _DoItalicsAndBold($text);
# Do hard breaks:
$text =~ s/ {2,}\n/ Just type tags
#
my $text = shift || return '';
# Strip leading and trailing lines:
$text =~ s/\A\n+//;
$text =~ s/\n+\z//;
my @grafs = split(/\n{2,}/, $text);
my $count = scalar @grafs;
#
# Wrap tags.
#
foreach (@grafs) {
unless (defined( $g_html_blocks{$_} )) {
$_ = _RunSpanGamut($_);
s/^([ \t]*)/ /;
$_ .= "
or tags.
# my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
foreach my $cur_token (@$tokens) {
if ($cur_token->[0] eq "tag") {
# Within tags, encode * and _ so they don't conflict
# with their use in Markdown for italics and strong.
# We're replacing each such character with its
# corresponding MD5 checksum value; this is likely
# overkill, but it should prevent us from colliding
# with the escape values by accident.
$cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
$cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
$text .= $cur_token->[1];
} else {
my $t = $cur_token->[1];
$t = _EncodeBackslashEscapes($t);
$text .= $t;
}
}
return $text;
}
sub _DoAnchors {
#
# Turn Markdown link shortcuts into XHTML tags.
#
my $text = shift || return '';
#
# First, handle reference-style links: [link text] [id]
#
$text =~ s{
( # wrap whole match in $1
\[
($g_nested_brackets) # link text = $2
\]
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
(.*?) # id = $3
\]
)
}{
my $result;
my $whole_match = $1;
my $link_text = $2;
my $link_id = lc $3;
if ($link_id eq "") {
$link_id = lc $link_text; # for shortcut links like [this][].
}
if (defined $g_urls{$link_id}) {
my $url = $g_urls{$link_id};
$url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
$url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
$result = "? # href = $3
[ \t]*
( # $4
(['"]) # quote char = $5
(.*?) # Title = $6
\5 # matching quote
)? # title is optional
\)
)
}{
my $result;
my $whole_match = $1;
my $link_text = $2;
my $url = $3;
my $title = $6;
$url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
$url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
$result = " tags.
#
my $text = shift || return '';
#
# First, handle reference-style labeled images: ![alt text][id]
#
$text =~ s{
( # wrap whole match in $1
!\[
(.*?) # alt text = $2
\]
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
(.*?) # id = $3
\]
)
}{
my $result;
my $whole_match = $1;
my $alt_text = $2;
my $link_id = lc $3;
if ($link_id eq "") {
$link_id = lc $alt_text; # for shortcut links like ![this][].
}
$alt_text =~ s/"/"/g;
if (defined $g_urls{$link_id}) {
my $url = $g_urls{$link_id};
$url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
$url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
$result = "? # src url = $3
[ \t]*
( # $4
(['"]) # quote char = $5
(.*?) # title = $6
\5 # matching quote
[ \t]*
)? # title is optional
\)
)
}{
my $result;
my $whole_match = $1;
my $alt_text = $2;
my $url = $3;
my $title = '';
if (defined($6)) {
$title = $6;
}
$alt_text =~ s/"/"/g;
$title =~ s/"/"/g;
$url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
$url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
$result = "
" . _RunSpanGamut($1) . "\n\n";
}egx;
$text =~ s{ (.+)[ \t]*\n-+[ \t]*\n+ }{
"
" . _RunSpanGamut($1) . "
\n\n";
}egx;
# atx-style headers:
# # Header 1
# ## Header 2
# ## Header 2 with closing hashes ##
# ...
# ###### Header 6
#
$text =~ s{
^(\#{1,6}) # $1 = string of #'s
[ \t]*
(.+?) # $2 = Header text
[ \t]*
\#* # optional closing #'s (not counted)
\n+
}{
my $h_level = length($1);
"` blocks.
#
my $text = shift || return '';
$text =~ s{
(?:\n\n|\A)
( # $1 = the code block -- one or more lines, starting with a space/tab
(?:
(?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
.*\n+
)+
)
((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
}{
my $codeblock = $1;
my $result; # return value
$codeblock = _EncodeCode(_Outdent($codeblock));
$codeblock = _Detab($codeblock);
$codeblock =~ s/\A\n+//; # trim leading newlines
$codeblock =~ s/\s+\z//; # trim trailing whitespace
$result = "\n\n";
@egsx;
return $text;
}
sub _EncodeCode {
#
# Encode/escape certain characters inside Markdown code runs.
# The point is that in code, these characters are literals,
# and lose their special Markdown meanings.
#
# Params:
#   the raw text of a code span or code block.
#
# Returns:
#   the text with '&', '<' and '>' turned into HTML entities and with
#   the Markdown-significant characters * _ { } [ ] \ replaced by
#   their placeholder values from %g_escape_table. Returns '' when
#   called without text (undef or the empty string).
#
    my $code = shift;

    # Only bail out on missing/empty input. A plain truth test would
    # also throw away the perfectly valid code text "0".
    return '' unless defined $code && length $code;

    # Encode all ampersands; HTML entities are not
    # entities within a Markdown code span. This has to happen before
    # the angle brackets, or the '&' of '&lt;' / '&gt;' would be
    # encoded a second time.
    $code =~ s!&!&amp;!g;

    # Do the angle bracket song and dance:
    $code =~ s!<!&lt;!g;
    $code =~ s!>!&gt;!g;

    # Now, escape characters that are magic in Markdown. Braces are
    # backslash-escaped in the patterns so they can never be read as
    # the start of a quantifier.
    $code =~ s!\*!$g_escape_table{'*'}!g;
    $code =~ s!_!$g_escape_table{'_'}!g;
    $code =~ s!\{!$g_escape_table{'{'}!g;
    $code =~ s!\}!$g_escape_table{'}'}!g;
    $code =~ s!\[!$g_escape_table{'['}!g;
    $code =~ s!\]!$g_escape_table{']'}!g;
    $code =~ s!\\!$g_escape_table{'\\'}!g;

    return $code;
}
sub _DoItalicsAndBold {
#
# Turn Markdown emphasis markers into HTML emphasis tags.
#
#   **text** or __text__   becomes   <strong>text</strong>
#   *text*   or _text_     becomes   <em>text</em>
#
# The opening marker must be followed by a non-space character, the
# closing marker must be preceded by one, and both markers must be
# the same.
#
# Params:
#   the span-level text to process.
#
# Returns:
#   the text with emphasis markers replaced by tags. Returns '' when
#   called without text (undef or the empty string).
#
    my $text = shift;

    # Only bail out on missing/empty input. A plain truth test would
    # also throw away a span whose whole text is "0".
    return '' unless defined $text && length $text;

    # <strong> must go first; otherwise the single-character pass
    # would consume one marker from each side of a doubled pair.
    $text =~ s{ (\*\*|__) (?=\S) (.+?) (?<=\S) \1 }
        {<strong>$2</strong>}gsx;

    # Then <em>:
    $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
        {<em>$2</em>}gsx;

    return $text;
}
sub _DoBlockQuotes {
my $text = shift || return '';
$text =~ s{
( # Wrap whole match in $1
(
^[ \t]*>[ \t]? # '>' at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
}{
my $bq = $1;
$bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
$bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
$bq = _RunBlockGamut($bq); # recurse
$bq =~ s/^/ /g;
# These leading spaces screw with
\n\n";
$result;
}egmx;
return $text;
}
sub _DoCodeSpans {
#
# * Backtick quotes are used for " . $codeblock . "\n spans.
#
# * You can use multiple backticks as the delimiters if you want to
# include literal backticks in the code span. So, this input:
#
# Just type ``foo `bar` baz`` at the prompt.
#
# Will translate to:
#
# foo `bar` baz at the prompt.`bar` ...
#
my $text = shift || return '';
$text =~ s@
(`+) # $1 = Opening run of `
(.+?) # $2 = The code block
(?$c content, so we need to fix that:
$bq =~ s{
(\s*.+?
)
}{
my $pre = $1;
$pre =~ s/^ //mg;
$pre;
}egsx;
"\n$bq\n
\n\n";
}egmx;
return $text;
}
sub _FormParagraphs {
#
# Params:
# $text - string to process with html as well).
For more information about Markdown's syntax, see:
http://daringfireball.net/projects/markdown/
=head1 OPTIONS
Use "--" to end switch parsing. For example, to open a file named "-z", use:
Markdown.pl -- -z
=over 4
=item B<--html4tags>
Use HTML 4 style for empty element tags, e.g.:

    <br>

instead of Markdown's default XHTML style tags, e.g.:

    <br />

=item B<-v>, B<--version>
Display Markdown's version number and copyright information.
=item B<-s>, B<--shortversion>
Display the short-form version number.
=back
=head1 BUGS
To file bug reports or feature requests (other than topics listed in the
Caveats section above) please send email to:
support@daringfireball.net
Please include with your report: (1) the example input; (2) the output
you expected; (3) the output Markdown actually produced.
=head1 VERSION HISTORY
1.0:
+ Blockquote contents are once again indented by two spaces,
and `<pre>` content is special-cased.
1.0fc2:
+ Disabled the 'use utf8' pragma, which caused crashes for people
running Markdown on Perl 5.6.1.
+ Fixed a couple of bugs in _DoLists() and _ProcessListItems() that
caused unordered lists starting with `+` or `-` to be turned into
*ordered* lists.
+ Added to the list of block-level HTML tags:
noscript, form, fieldset, iframe, math
+ Fixed an odd bug where, with input like this:
> This line starts the blockquote
* This list is part of the quote.
* Second item.
This paragraph is not part of the blockquote.
The trailing paragraph was incorrectly included in the
blockquote. (The solution was to add an extra "\n" after
lists.)
+ The contents of `<blockquote>` tags are no longer indented
in the HTML output. It made the source look neater, but
screwed with any `<pre>` blocks in the blockquote.
+ When running under MT 3.0 or later, now displays version number
in the Markdown info on the main screen.
1.0fc1:
+ Added some MT 3.0 stuff to register the plug-in in the MT web UI.
This should not prevent Markdown from running under MT 2.6.
+ Greatly simplified the rules for code blocks. No more colons
necessary; if it's indented (4 spaces or 1 tab), it's a code block.
+ Unordered list items can now be denoted by any of the following
bullet markers: [*+-]
+ _DoCodeSpans() now uses a much simpler regex pattern. Thanks to
Michel Fortin for the patch.
* s/"/&quot;/g to fix literal quotes within title attributes.
1.0b9:
* s/"/&quot;/g to fix literal quotes within img alt attributes.
1.0b8:
* Tweaked file slurping syntax.
* Added 'math' tags to block-level tag patterns in _HashHTMLBlocks().
Please disregard all the 'math'-tag related items in 1.0b7.
* Commented out some vestigial code in _EscapeSpecialChars()
1.0b7:
* Added 'math' to $tags_to_skip pattern, for MathML users.
* Tweaked regex for identifying HTML entities in
_EncodeAmpsAndAngles(), so as to allow for the very long entity
names used by MathML. (Thanks to Jacques Distler for the patch.)
* _DoCodeSpans() now uses 'no strict' inside the ??{} construct
in the regex pattern, which (the addition of 'no strict') allows
us to use $backtick_count as an undeclared variable. Perl 5.8.4
complains without a 'my' here under 'use strict', but earlier
versions of Perl *won't* allow 'my' here. Bizarrely, the
opposite is true of 'local', which works fine under Perl 5.6.1 -
5.8.3, but doesn't work under Perl 5.8.4.
* All the internal subroutines now return an empty string if
called without a text parameter. It's my hope that this will
let Markdown.pl work better under Perl 5.6.0.
1.0b6:
* Added support for `<ins>` and `<del>` as block-level tags.
This only works if the start and end tags are on lines by
themselves.
* Three or more underscores can now be used for horizontal rules.
* Lines containing only whitespace are trimmed from blockquotes.
* You can now optionally wrap URLs with angle brackets -- like so:
`