#!/usr/bin/ruby # # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion # tool. # # == Synopsis # # doc = BlueCloth.new " # ## Test document ## # # Just a simple test. # " # # puts doc.to_html # # == Authors # # * Michael Granger # # == Contributors # # * Martin Chase - Peer review, helpful suggestions # * Florian Gross - Filter options, suggestions # # == Copyright # # Original version: # Copyright (c) 2004, 2005, John Gruber # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # * Neither the name "Markdown" nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # This software is provided by the copyright holders and contributors "as # is" and any express or implied warranties, including, but not limited # to, the implied warranties of merchantability and fitness for a # particular purpose are disclaimed. In no event shall the copyright owner # or contributors be liable for any direct, indirect, incidental, special, # exemplary, or consequential damages (including, but not limited to, # procurement of substitute goods or services; loss of use, data, or # profits; or business interruption) however caused and on any theory of # liability, whether in contract, strict liability, or tort (including # negligence or otherwise) arising in any way out of the use of this # software, even if advised of the possibility of such damage. 
# # Ruby port: # Copyright (c) 2004, 2005 The FaerieMUD Consortium. # # You may use, modify, and/or redistribute this software under the same terms # as Ruby itself. A copy of Ruby's license should be included in this package; # if not, it can be obtained online at: # http://www.ruby-lang.org/en/LICENSE.txt. # # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. # # == To-do # # * Refactor some of the larger uglier methods that have to do their own # brute-force scanning because of lack of Perl features in Ruby's Regexp # class. Alternately, could add a dependency on 'pcre' and use most Perl # regexps. # # * Put the StringScanner in the render state for thread-safety. # # == Version # # $Id$ # require 'digest/md5' require 'logger' require 'strscan' ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion ### tool. class BlueCloth < String ### Exception class for formatting errors. class FormatError < RuntimeError ### Create a new FormatError with the given source +str+ and an optional ### message about the +specific+ error. def initialize( str, specific=nil ) if specific msg = "Bad markdown format near %p: %s" % [ str, specific ] else msg = "Bad markdown format near %p" % str end super( msg ) end end # Release Version VERSION = '1.1.0' # SVN Revision SVNREV = %q$Rev$ # SVN Id tag SVNID = %q$Id$ # Rendering state struct. Keeps track of URLs, titles, and HTML blocks # midway through a render. I prefer this to the globals of the Perl version # because globals make me break out in hives. Or something. RenderState = Struct.new( "RenderState", :urls, :titles, :html_blocks, :list_level, :log ) # Tab width for #detab! if none is specified # :TODO: Make this DEFAULT_TAB_WIDTH and make tab width a per-instance setting instead. 
TAB_WIDTH = 4
LESS_THAN_TAB_WIDTH = TAB_WIDTH - 1

# The tag-closing string -- set to '>' for HTML
EMPTY_ELEMENT_SUFFIX = " />"

# Table of MD5 sums for escaped characters. Each special character maps to
# a Hash with the placeholder digest (:md5), a pattern matching that digest
# (:md5re), and a pattern matching the backslash-escaped character (:re).
ESCAPE_TABLE = {}
'\\`*_{}[]()>#+-.!'.each_char do |char|
  digest = Digest::MD5.hexdigest( char )

  ESCAPE_TABLE[ char ] = {
    :md5 => digest,
    :md5re => Regexp.new( digest ),
    :re => Regexp.new( '\\\\' + Regexp.escape(char) ),
  }
end


#################################################################
###	I N S T A N C E   M E T H O D S
#################################################################

### Create a new BlueCloth string from +content+. Each entry in
### +restrictions+ (possibly nested in Arrays) names a boolean attribute
### that is switched on by calling its writer, e.g. :filter_html.
def initialize( content="", *restrictions )
  # Log to standard error; the level follows $DEBUG / $VERBOSE. ($deferr,
  # which was used here originally, no longer exists in current Rubies, so
  # the logger silently discarded everything.)
  @log = Logger.new( $stderr )
  @log.level =
    if $DEBUG then Logger::DEBUG
    elsif $VERBOSE then Logger::INFO
    else Logger::WARN
    end
  @scanner = nil

  # Add any restrictions, and set the line-folding attribute to reflect
  # what happens by default.
  @filter_html = nil
  @filter_styles = nil
  restrictions.flatten.each {|restriction| __send__("#{restriction}=", true) }
  @fold_lines = true

  super( content )

  @log.debug "String is: %p" % self
end


######
public
######

# Filters for controlling what gets output for untrusted input. (But really,
# you're filtering bad stuff out of untrusted input at submission-time via
# untainting, aren't you?)
attr_accessor :filter_html, :filter_styles

# RedCloth-compatibility accessor. Line-folding is part of Markdown syntax,
# so this isn't used by anything.
attr_accessor :fold_lines


### Render Markdown-formatted text in this string object as HTML and return
### it. The parameter is for compatibility with RedCloth, and is currently
### unused, though that may change in the future.
def to_html( lite=false )

  # Create a StringScanner we can reuse for various lexing tasks
  @scanner = StringScanner.new( '' )

  # Make a structure to carry around stuff that gets placeholdered out of
  # the source.
  rs = RenderState.new( {}, {}, {}, 0 )

  # Make a copy of the string with normalized line endings, tabs turned to
  # spaces, and a couple of guaranteed newlines at the end. The copy is
  # made with #dup and edited in place: String#gsub on a String subclass
  # returns a plain String on Ruby 3.0 and later, which has no #detab.
  text = dup
  text.gsub!( /\r\n?/, "\n" )
  text.detab!
  text += "\n\n"
  @log.debug "Normalized line-endings: %p" % text

  # Filter HTML if we're asked to do so: turn every angle bracket into its
  # entity so no markup from the source survives.
  if filter_html
    text.gsub!( "<", "&lt;" )
    text.gsub!( ">", "&gt;" )
    @log.debug "Filtered HTML: %p" % text
  end

  # Simplify blank lines
  text.gsub!( /^ +$/, '' )
  @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

  # Replace HTML blocks with placeholders
  text = hide_html_blocks( text, rs )
  @log.debug "Hid HTML blocks: %p" % text
  @log.debug "Render state: %p" % rs

  # Strip link definitions, store in render state
  text = strip_link_definitions( text, rs )
  @log.debug "Stripped link definitions: %p" % text
  @log.debug "Render state: %p" % rs

  # Escape meta-characters
  text = escape_special_chars( text )
  @log.debug "Escaped special characters: %p" % text

  # Transform block-level constructs
  text = apply_block_transforms( text, rs )
  @log.debug "After block-level transforms: %p" % text

  # Now swap back in all the escaped characters
  text = unescape_special_chars( text )
  @log.debug "After unescaping special characters: %p" % text

  return text
end


### Return a copy of this string with tabs converted to spaces, using tab
### stops every +tabwidth+ columns.
def detab( tabwidth=TAB_WIDTH )
  copy = dup
  copy.detab!( tabwidth )
  return copy
end


### Convert tabs to spaces in place, using tab stops every +tabwidth+
### columns, and return self. Note that lines are split on "\n" with
### String#split, which discards trailing empty fields, so any trailing
### newlines are removed from the receiver.
def detab!( tabwidth=TAB_WIDTH )
  expanded = split( /\n/ ).map do |line|
    # Each expanded segment ends exactly on a tab stop, so the length of
    # the text since the previous tab is enough to find the next stop.
    line.gsub( /(.*?)\t/ ) do
      lead = $1
      lead + ' ' * (tabwidth - lead.length % tabwidth)
    end
  end

  replace( expanded.join("\n") )
end


#######
#private
#######

### Do block-level transforms on a copy of +str+ using the specified render
### state +rs+ and return the results.
def apply_block_transforms( str, rs ) # Port: This was called '_runBlockGamut' in the original @log.debug "Applying block transforms to:\n %p" % str text = transform_headers( str, rs ) text = transform_hrules( text, rs ) text = transform_lists( text, rs ) text = transform_code_blocks( text, rs ) text = transform_block_quotes( text, rs ) text = hide_html_blocks( text, rs ) text = form_paragraphs( text, rs ) @log.debug "Done with block transforms:\n %p" % text return text end ### Apply Markdown span transforms to a copy of the specified +str+ with the ### given render state +rs+ and return it. def apply_span_transforms( str, rs ) @log.debug "Applying span transforms to:\n %p" % str str = transform_code_spans( str, rs ) str = escape_special_chars( str ) str = transform_images( str, rs ) str = transform_anchors( str, rs ) str = transform_auto_links( str, rs ) str = encode_html( str ) str = transform_italic_and_bold( str, rs ) # Hard breaks str.gsub!( / {2,}\n/, " #
# tags for inner block must be indented. #
# StrictBlockRegex = %r{ ^ # Start of line <(#{StrictTagPattern}) # Start tag: \2 \b # word break (.*\n)*? # Any number of lines, minimal match # Matching end tag [ ]* # trailing spaces $ # End of line or document }ix # More-liberal block-matching LooseBlockRegex = %r{ ^ # Start of line <(#{LooseTagPattern}) # start tag: \2 \b # word break (.*\n)*? # Any number of lines, minimal match .* # Anything + Matching end tag [ ]* # trailing spaces $ # End of line or document }ix # Special case for
. HruleBlockRegex = %r{ ( # $1 \A\n? # Start of doc + optional \n | # or .*\n\n # anything + blank line ) ( # save in $2 [ ]{0,#{LESS_THAN_TAB_WIDTH}} # Any spaces
])*? # Attributes /?> # Tag close $ # followed by a blank line or end of document ) }ix # Special case for standalone HTML comments CommentBlockRegex = %r{ ( # $1 \A\n? # Start of doc + optional \n | # or .*\n\n # anything + blank line ) ( # save in $2 [ ]{0,#{LESS_THAN_TAB_WIDTH}} # Any spaces (?: ) $ # followed by a blank line or end of document ) }ix ### Replace all blocks of HTML in +str+ that start in the left margin with ### tokens. def hide_html_blocks( str, rs ) @log.debug "Hiding HTML blocks in %p" % str # Tokenizer proc to pass to gsub tokenize = lambda {|match| key = Digest::MD5.hexdigest( match ) rs.html_blocks[ key ] = match @log.debug "Replacing %p with %p" % [ match, key ] "\n\n#{key}\n\n" } rval = str.dup @log.debug "Finding blocks with the strict regex..." rval.gsub!( StrictBlockRegex, &tokenize ) @log.debug "Finding blocks with the loose regex..." rval.gsub!( LooseBlockRegex, &tokenize ) @log.debug "Finding hrules..." rval.gsub!( HruleBlockRegex ) {|match| $1 + tokenize[$2] } @log.debug "Finding comments..." rval.gsub!( CommentBlockRegex ) {|match| $1 + tokenize[$2] } return rval end # Link defs are in the form: ^[id]: url "optional title" LinkRegex = %r{ ^[ ]{0,#{LESS_THAN_TAB_WIDTH}}\[(.+)\]: # id = $1 [ ]* \n? # maybe *one* newline [ ]* ? # url = $2 [ ]* \n? # maybe one newline [ ]* (?: # Titles are delimited by "quotes" or (parens). ["(] (.+?) # title = $3 [")] # Matching ) or " [ ]* )? # title is optional (?:\n+|\Z) }x ### Strip link definitions from +str+, storing them in the given RenderState ### +rs+. def strip_link_definitions( str, rs ) str.gsub( LinkRegex ) {|match| id, url, title = $1, $2, $3 rs.urls[ id.downcase ] = encode_html( url ) unless title.nil? rs.titles[ id.downcase ] = title.gsub( /"/, """ ) end "" } end ### Escape special characters in the given +str+ def escape_special_chars( str ) @log.debug " Escaping special characters" text = '' # Split the HTML into tags and text, calling back into this block for # each chunk. 
tokenize_html( str ) {|token, str| @log.debug " Adding %p token %p" % [ token, str ] case token # Within tags, encode * and _ when :tag text += str. gsub( /\*/, ESCAPE_TABLE['*'][:md5] ). gsub( /_/, ESCAPE_TABLE['_'][:md5] ) # Encode backslashed stuff in regular text when :text text += encode_backslash_escapes( str ) else raise TypeError, "Unknown token type %p" % token end } @log.debug " Text with escapes is now: %p" % text return text end ### Swap escaped special characters in a copy of the given +str+ and return ### it. def unescape_special_chars( str ) ESCAPE_TABLE.each {|char, hash| @log.debug "Unescaping escaped %p with %p" % [ char, hash[:md5re] ] str.gsub!( hash[:md5re], char ) } return str end ### Return a copy of the given +str+ with any backslashed special character ### in it replaced with MD5 placeholders. def encode_backslash_escapes( str ) # Make a copy with any double-escaped backslashes encoded text = str.gsub( /\\\\/, ESCAPE_TABLE['\\'][:md5] ) ESCAPE_TABLE.each_pair {|char, esc| next if char == '\\' text.gsub!( esc[:re], esc[:md5] ) } return text end ### Transform any Markdown-style horizontal rules in a copy of the specified ### +str+ and return it. def transform_hrules( str, rs ) @log.debug " Transforming horizontal rules" str.gsub( /^[ ]{0,2}( ?[\-\*_] ?){3,} *$/, "\n\n%s\n} % [ pre, list_type, transform_list_items( list, rs ), list_type, ] } end # Pattern for transforming list items ListItemRegexp = %r{ (\n)? # leading line = $1 (^[ ]*) # leading whitespace = $2 (#{ListMarkerAny}) [ ]+ # list marker = $3 ((?m:.+?) # list item text = $4 (\n{1,2})) (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+)) }x ### Transform list items in a copy of the given +str+ and return it. 
def transform_list_items( str, rs )
  @log.debug " Transforming list items"

  # Increment the marker for parsing sublists
  rs.list_level += 1

  # Trim trailing blank lines
  str = str.sub( /\n{2,}\z/, "\n" )

  str.gsub( ListItemRegexp ) {|line|
    @log.debug " Found item line %p" % line
    # Grab the captures before anything else runs another match.
    leading_line, item = $1, $4

    if leading_line or /\n{2,}/.match( item )
      # Items separated by blank lines get full block-level treatment
      @log.debug " Found leading line or item has a blank"
      item = apply_block_transforms( outdent(item), rs )
    else
      # Recursion for sub-lists
      @log.debug " Recursing for sublist"
      item = transform_lists( outdent(item), rs ).chomp
      item = apply_span_transforms( item, rs )
    end

    %{<li>%s</li>\n} % item
  }
ensure
  # Decrement the list-level counter, even if a transform raised
  rs.list_level -= 1
end


# Pattern for matching codeblocks
CodeBlockRegexp = %r{
  (?:\n\n|\A)
  (                                   # $1 = the code block
    (?:
      (?:[ ]{#{TAB_WIDTH}} | \t)      # a tab or tab-width of spaces
      .*\n+
    )+
  )
  (^[ ]{0,#{TAB_WIDTH - 1}}\S|\Z)     # Lookahead for non-space at
                                      # line-start, or end of doc
}x


### Transform Markdown-style codeblocks in a copy of the specified +str+ and
### return it. Each block is outdented one level, entity-encoded and wrapped
### in <pre><code> tags; whatever followed the block is re-emitted after it.
def transform_code_blocks( str, rs )
  @log.debug " Transforming code blocks"

  str.gsub( CodeBlockRegexp ) {|block|
    codeblock = $1
    remainder = $2

    # Generate the codeblock
    encoded = encode_code( outdent(codeblock), rs ).rstrip
    %{\n\n<pre><code>%s\n</code></pre>\n\n%s} % [ encoded, remainder ]
  }
end


# Pattern for matching Markdown blockquote blocks
BlockQuoteRegexp = %r{
  (?:
    ^[ ]*>[ ]?        # '>' at the start of a line
    .+\n              # rest of the first line
    (?:.+\n)*         # subsequent consecutive lines
    \n*               # blanks
  )+
}x

# Pattern for matching a <pre> element (with its leading whitespace) inside
# an already-rendered blockquote
PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm


### Transform Markdown-style blockquotes in a copy of the specified +str+
### and return it.
def transform_block_quotes( str, rs )
  @log.debug " Transforming block quotes"

  str.gsub( BlockQuoteRegexp ) {|quote|
    @log.debug "Making blockquote from %p" % quote

    quote.gsub!( /^ *> ?/, '' )   # Trim one level of quoting
    quote.gsub!( /^ +$/, '' )     # Trim whitespace-only lines

    # Indent the rendered contents by one level, then take the indent
    # back out of any <pre> chunks so their contents are left untouched.
    indent = " " * TAB_WIDTH
    inner = apply_block_transforms( quote, rs ).gsub( /^/, indent )
    inner = inner.gsub( PreChunk ) {|pre| pre.gsub(/^#{indent}/o, '') }

    quoted = %{<blockquote>\n%s\n</blockquote>\n\n} % inner
    @log.debug "Blockquoted chunk is: %p" % quoted
    quoted
  }
end


# Patterns for <url> and <address@example.com> style automatic links
AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/
AutoAnchorEmailRegexp = %r{
  <
  (
    [-.\w]+
    \@
    [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  )
  >
}xi


### Transform URLs in a copy of the specified +str+ into links and return
### it.
def transform_auto_links( str, rs )
  @log.debug " Transforming auto-links"

  linked = str.gsub( AutoAnchorURLRegexp, %{<a href="\\1">\\1</a>} )
  linked.gsub( AutoAnchorEmailRegexp ) {|addr|
    encode_email_address( unescape_special_chars($1) )
  }
end


# Encoder functions to turn characters of an email address into encoded
# entities. Each takes the Integer value of one byte: decimal entity, hex
# entity, and the raw character respectively.
Encoders = [
  lambda {|char| "&#%03d;" % char},
  lambda {|char| "&#x%X;" % char},
  lambda {|char| char.chr },
]


### Transform a copy of the given email +addr+ into an escaped version safer
### for posting publicly, and return it as a mailto: anchor tag whose link
### text is the (encoded) address without the "mailto:" prefix.
def encode_email_address( addr )
  encoded = ''.dup

  ("mailto:" + addr).each_byte do |byte|
    # each_byte yields Integers, while ?: and ?@ are Strings on Ruby 1.9
    # and later, so compare on the one-character String instead.
    piece =
      case byte.chr
      when ':'
        # Left literal so the "mailto:" prefix can be stripped below
        ':'
      when '@'
        # Always entity-encode the '@', as decimal or hex
        Encoders[ rand(2) ][ byte ]
      else
        # Roughly 10% raw, 45% hex, 45% decimal
        roll = rand(100)
        if roll > 90 then Encoders[2][ byte ]
        elsif roll < 45 then Encoders[1][ byte ]
        else Encoders[0][ byte ]
        end
      end
    encoded << piece
  end

  return %{<a href="%s">%s</a>} % [ encoded, encoded.sub(/.+?:/, '') ]
end


# Regex for matching Setext-style headers
SetextHeaderRegexp = %r{
  (.+)            # The title text ($1)
  \n
  ([\-=])+        # Match a line of = or -. Save only one in $2.
  [ ]*\n+
}x

# Regexp for matching ATX-style headers
AtxHeaderRegexp = %r{
  ^(\#{1,6})      # $1 = string of #'s
  [ ]*
  (.+?)           # $2 = Header text
  [ ]*
  \#*             # optional closing #'s (not counted)
  \n+
}x


### Apply Markdown header transforms to a copy of the given +str+ and render
### state +rs+ and return the result.
def transform_headers( str, rs )
  @log.debug " Transforming headers"

  # Setext-style headers:
  #     Header 1
  #     ========
  #
  #     Header 2
  #     --------
  #
  with_setext = str.gsub( SetextHeaderRegexp ) {|m|
    @log.debug "Found setext-style header"
    title, hdrchar = $1, $2
    title = apply_span_transforms( title, rs )

    case hdrchar
    when '='
      %[<h1>#{title}</h1>\n\n]
    when '-'
      %[<h2>#{title}</h2>\n\n]
    else
      title
    end
  }

  # ATX-style headers: one to six leading #'s give the header level
  with_setext.gsub( AtxHeaderRegexp ) {|m|
    @log.debug "Found ATX-style header"
    hdrchars, title = $1, $2
    title = apply_span_transforms( title, rs )

    level = hdrchars.length
    %{<h%d>%s</h%d>\n\n} % [ level, title, level ]
  }
end


### Wrap all remaining paragraph-looking text in a copy of +str+ inside

    ### tags and return it. def form_paragraphs( str, rs ) @log.debug " Forming paragraphs" grafs = str. sub( /\A\n+/, '' ). sub( /\n+\z/, '' ). split( /\n{2,}/ ) rval = grafs.collect {|graf| # Unhashify HTML blocks if this is a placeholder if rs.html_blocks.key?( graf ) rs.html_blocks[ graf ] # Otherwise, wrap in

    tags else apply_span_transforms(graf, rs). sub( /^[ ]*/, '

    ' ) + '

    ' end }.join( "\n\n" ) @log.debug " Formed paragraphs: %p" % rval return rval end # Pattern to match the linkid part of an anchor tag for reference-style # links. RefLinkIdRegex = %r{ [ ]? # Optional leading space (?:\n[ ]*)? # Optional newline + spaces \[ (.*?) # Id = $1 \] }x InlineLinkRegex = %r{ \( # Literal paren [ ]* # Zero or more spaces ? # URI = $1 [ ]* # Zero or more spaces (?: # ([\"\']) # Opening quote char = $2 (.*?) # Title = $3 \2 # Matching quote char )? # Title is optional \) }x ### Apply Markdown anchor transforms to a copy of the specified +str+ with ### the given render state +rs+ and return it. def transform_anchors( str, rs ) @log.debug " Transforming anchors" @scanner.string = str.dup text = '' # Scan the whole string until @scanner.eos? if @scanner.scan( /\[/ ) link = ''; linkid = '' depth = 1 startpos = @scanner.pos @log.debug " Found a bracket-open at %d" % startpos # Scan the rest of the tag, allowing unlimited nested []s. If # the scanner runs out of text before the opening bracket is # closed, append the text and return (wasn't a valid anchor). while depth.nonzero? linktext = @scanner.scan_until( /\]|\[/ ) if linktext @log.debug " Found a bracket at depth %d: %p" % [ depth, linktext ] link += linktext # Decrement depth for each closing bracket depth += ( linktext[-1, 1] == ']' ? -1 : 1 ) @log.debug " Depth is now #{depth}" # If there's no more brackets, it must not be an anchor, so # just abort. else @log.debug " Missing closing brace, assuming non-link." link += @scanner.rest @scanner.terminate return text + '[' + link end end link.slice!( -1 ) # Trim final ']' @log.debug " Found leading link %p" % link # Look for a reference-style second part if @scanner.scan( RefLinkIdRegex ) linkid = @scanner[1] linkid = link.dup if linkid.empty? linkid.downcase! @log.debug " Found a linkid: %p" % linkid # If there's a matching link in the link table, build an # anchor tag for it. 
if rs.urls.key?( linkid ) @log.debug " Found link key in the link table: %p" % rs.urls[linkid] url = escape_md( rs.urls[linkid] ) text += %{#{link}} # If the link referred to doesn't exist, just append the raw # source to the result else @log.debug " Linkid %p not found in link table" % linkid @log.debug " Appending original string instead: " @log.debug "%p" % @scanner.string[ startpos-1 .. @scanner.pos-1 ] text += @scanner.string[ startpos-1 .. @scanner.pos-1 ] end # ...or for an inline style second part elsif @scanner.scan( InlineLinkRegex ) url = @scanner[1] title = @scanner[3] @log.debug " Found an inline link to %p" % url text += %{#{link}} # No linkid part: just append the first part as-is. else @log.debug "No linkid, so no anchor. Appending literal text." text += @scanner.string[ startpos-1 .. @scanner.pos-1 ] end # if linkid # Plain text else @log.debug " Scanning to the next link from %p" % @scanner.rest text += @scanner.scan( /[^\[]+/ ) end end # until @scanner.eos? return text end # Pattern to match strong emphasis in Markdown text BoldRegexp = %r{ (\*\*|__) (\S|\S.*?\S) \1 }x # Pattern to match normal emphasis in Markdown text ItalicRegexp = %r{ (\*|_) (\S|\S.*?\S) \1 }x ### Transform italic- and bold-encoded text in a copy of the specified +str+ ### and return it. def transform_italic_and_bold( str, rs ) @log.debug " Transforming italic and bold" str. gsub( BoldRegexp, %{\\2} ). gsub( ItalicRegexp, %{\\2} ) end ### Transform backticked spans into spans. def transform_code_spans( str, rs ) @log.debug " Transforming code spans" # Set up the string scanner and just return the string unless there's at # least one backtick. @scanner.string = str.dup unless @scanner.exist?( /`/ ) @scanner.terminate @log.debug "No backticks found for code span in %p" % str return str end @log.debug "Transforming code spans in %p" % str # Build the transformed text anew text = '' # Scan to the end of the string until @scanner.eos? 
# Scan up to an opening backtick if pre = @scanner.scan_until( /.?(?=`)/m ) text += pre @log.debug "Found backtick at %d after '...%s'" % [ @scanner.pos, text[-10, 10] ] # Make a pattern to find the end of the span opener = @scanner.scan( /`+/ ) len = opener.length closer = Regexp.new( opener ) @log.debug "Scanning for end of code span with %p" % closer # Scan until the end of the closing backtick sequence. Chop the # backticks off the resultant string, strip leading and trailing # whitespace, and encode any enitites contained in it. codespan = @scanner.scan_until( closer ) or raise FormatError.new( @scanner.rest[0,20], "No %p found before end" % opener ) @log.debug "Found close of code span at %d: %p" % [ @scanner.pos - len, codespan ] codespan.slice!( -len, len ) text += "%s" % encode_code( codespan.strip, rs ) # If there's no more backticks, just append the rest of the string # and move the scan pointer to the end else text += @scanner.rest @scanner.terminate end end return text end # Next, handle inline images: ![alt text](url "optional title") # Don't forget: encode * and _ InlineImageRegexp = %r{ ( # Whole match = $1 !\[ (.*?) \] # alt text = $2 \([ ]* ? # source url = $3 [ ]* (?: # (["']) # quote char = $4 (.*?) # title = $5 \4 # matching quote [ ]* )? # title is optional \) ) }xs #" # Reference-style images ReferenceImageRegexp = %r{ ( # Whole match = $1 !\[ (.*?) \] # Alt text = $2 [ ]? # Optional space (?:\n[ ]*)? # One optional newline + spaces \[ (.*?) \] # id = $3 ) }xs ### Turn image markup into image tags. def transform_images( str, rs ) @log.debug " Transforming images (%p)" % [str] # Handle reference-style labeled images: ![alt text][id] str. gsub( ReferenceImageRegexp ) {|match| whole, alt, linkid = $1, $2, $3.downcase @log.debug "Matched %p" % match res = nil alt.gsub!( /"/, '"' ) # for shortcut links like ![this][]. linkid = alt.downcase if linkid.empty? 
if rs.urls.key?( linkid ) url = escape_md( rs.urls[linkid] ) @log.debug "Found url '%s' for linkid '%s' " % [ url, linkid ] # Build the tag result = %{%s}, '>' ). gsub( CodeEscapeRegexp ) {|match| ESCAPE_TABLE[match][:md5]} end ################################################################# ### U T I L I T Y F U N C T I O N S ################################################################# ### Escape any markdown characters in a copy of the given +str+ and return ### it. def escape_md( str ) str. gsub( /\*/, ESCAPE_TABLE['*'][:md5] ). gsub( /_/, ESCAPE_TABLE['_'][:md5] ) end # Matching constructs for tokenizing X/HTML HTMLCommentRegexp = %r{ }mx XMLProcInstRegexp = %r{ <\? .*? \?> }mx MetaTag = Regexp.union( HTMLCommentRegexp, XMLProcInstRegexp ) HTMLTagOpenRegexp = %r{ < [a-z/!$] [^<>]* }imx HTMLTagCloseRegexp = %r{ > }x HTMLTagPart = Regexp.union( HTMLTagOpenRegexp, HTMLTagCloseRegexp ) ### Break the HTML source in +str+ into a series of tokens and return ### them. The tokens are just 2-element Array tuples with a type and the ### actual content. If this function is called with a block, the type and ### text parts of each token will be yielded to it one at a time as they are ### extracted. def tokenize_html( str ) depth = 0 tokens = [] @scanner.string = str.dup type, token = nil, nil until @scanner.eos? @log.debug "Scanning from %p" % @scanner.rest # Match comments and PIs without nesting if (( token = @scanner.scan(MetaTag) )) type = :tag # Do nested matching for HTML tags elsif (( token = @scanner.scan(HTMLTagOpenRegexp) )) tagstart = @scanner.pos @log.debug " Found the start of a plain tag at %d" % tagstart # Start the token with the opening angle depth = 1 type = :tag # Scan the rest of the tag, allowing unlimited nested <>s. If # the scanner runs out of text before the tag is closed, raise # an error. while depth.nonzero? 
# Scan either an opener or a closer chunk = @scanner.scan( HTMLTagPart ) or raise "Malformed tag at character %d: %p" % [ tagstart, token + @scanner.rest ] @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ] token += chunk # If the last character of the token so far is a closing # angle bracket, decrement the depth. Otherwise increment # it for a nested tag. depth += ( token[-1, 1] == '>' ? -1 : 1 ) @log.debug " Depth is now #{depth}" end # Match text segments else @log.debug " Looking for a chunk of text" type = :text # Scan forward, always matching at least one character to move # the pointer beyond any non-tag '<'. token = @scanner.scan_until( /[^<]+/m ) end @log.debug " type: %p, token: %p" % [ type, token ] # If a block is given, feed it one token at a time. Add the token to # the token list to be returned regardless. if block_given? yield( type, token ) end tokens << [ type, token ] end return tokens end ### Return a copy of +str+ with angle brackets and ampersands HTML-encoded. def encode_html( str ) str.gsub( /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i, "&" ). gsub( %r{<(?![a-z/?\$!])}i, "<" ) end ### Return one level of line-leading tabs or spaces from a copy of +str+ and ### return it. def outdent( str ) str.gsub( /^(\t|[ ]{1,#{TAB_WIDTH}})/, '') end end # class BlueCloth