Textile to HTML

Textile to HTML

Revisions made by tamc up to 04:43 Tue 09 Aug 2005

Note, if you wish to edit or undo these revisions, please follow the links from here. You cannot do it from here, because someone may have edited the page more recently and because you may need a password to edit the page.

04:43 Tue 09 Aug 2005

  1. 77. [ /^\'/, '‘' ], # single opening
  2. 78. [ /^"/, '“' ], # double opening
  3. 79. [ /\'$/, '’' ], # single closing
  4. 80. [ /\"$/, '”' ], # double closing
  5. 81. [ /\.{3}$/, '\1…' ], # ellipsis
  6. 82. [ '--', '\1—' ], # em dash
  7. 83. [ '->', ' → ' ], # right arrow
  8. 84. [ '-$', ' – ' ], # en dash
  9. 77. [ /^\'/, '‘' ], # single opening
  10. 78. [ /^"/, '“' ], # double opening
  11. 79. [ /\'$/, '’' ], # single closing
  12. 80. [ /\"$/, '”' ], # double closing
  13. 81. [ /\.{3}$/, '\1…' ], # ellipsis
  14. 82. [ '--', '\1—' ], # em dash
  15. 83. [ '->', ' ? ' ], # right arrow
  16. 84. [ '-$', ' – ' ], # en dash
  17. 86. [ '(TM)', '™' ], # trademark
  18. 87. [ '(R)', '®' ], # registered
  19. 88. [ '(C)', '©' ] # copyright
  20. 86. [ '(TM)', '™' ], # trademark
  21. 87. [ '(R)', '®' ], # registered
  22. 88. [ '(C)', '©' ] # copyright
  23. 492. str.gsub!( '&', '&' )
  24. 493. str.gsub!( '"', '"' )
  25. 494. str.gsub!( '<', '<')
  26. 495. str.gsub!( '>', '>')
  27. 492. str.gsub!( '&', '&' )
  28. 493. str.gsub!( '"', '"' )
  29. 494. str.gsub!( '<', '<')
  30. 495. str.gsub!( '>', '>')

04:43 Tue 09 Aug 2005

  1. 0. content moved from [[A re-write of Redcloth textile to html convertor]]
  2. 0. h1. Textile to HTML
  3. 1.
  4. 2. Textile is a markup language that is meant to make it quick and easy to write basic web pages for wikis, blogs and other uses. Redcloth is a ruby class that converts textile to html, written by _why. Details are at http://www.whytheluckystiff.net/ruby/redcloth/ . Unfortunately, due to my own failings, I find it hard to understand _why's code, so I wrote a partial reimplementation below. This has the benefit of (for me) being clearer and easier to extend. The drawback is that it doesn't do all that _why's version does and is slower. But it is here in case anyone finds it useful.
  5. 3.
  6. 4. <pre>
  7. 5. <code>
  8. 6. require 'strscan'
  9. 7.
  10. 8. # Note: Does not implement anything beyond that on http://hobix.com/textile/
  11. 9. # i.e. No markdown, link titles, and converting out of range ASCII
  12. 10. # AND IT IS MUCH SLOWER THAN REDCLOTH !!
  13. 11. #
  14. 12. # Bodged together in a few evenings by Tom Counsell (tamc2@cam.ac.uk)
  15. 13. # Feel free to do whatever you like with the code.
  16. 14. #
  17. 15. # Use is the same as for Redcloth (except that none of the options work):
  18. 16. # html = TextileParser.new( "textile string").to_html
  19. 17. #
  20. 18. class TextileParser
  21. 19.
  22. 20. # These are applied first
  23. 21. BLOCK_COMMANDS = [
  24. 22. [ /<pre.*?>/, :pre_tag ],
  25. 23. [ /<code.*?>/, :code_tag ],
  26. 24. [ /<notextile.*?>/, :notextile_tag ],
  27. 25. [ /h(\d)(.*?)\.\s+/i, :heading_block ],
  28. 26. [ /bq(.*?)\.\s+/i, :quote_block ],
  29. 27. [ /(\*+|\#+)\s+/, :list_block ],
  30. 28. [ /table(.*?)\.\s+/i, :table_block ],
  31. 29. [ /\|/i, :unspecified_table_block ],
  32. 30. [ /\{.+?\}\.\s+\|/i, :unspecified_table_block ], # A styled table row
  33. 31. [ /p(.*?)\.\s+/i, :paragraph_block ],
  34. 32. [ /fn(\d+)(.*?)\.\s+/i, :footnote_block ],
  35. 33. [ /\S+/i, :unspecified_block ],
  36. 34. [ /\s+/i, :skip ],
  37. 35. ]
  38. 36.
  39. 37. # Then these
  40. 38. COMMANDS = [
  41. 39. [ /<pre.*?>/, :pre_tag ], # Can be inline as well as in a block
  42. 40. [ /<code.*?>/, :code_tag ], # Can be inline as well as in a block
  43. 41. [ /<notextile.*?>/, :notextile_tag ], # Can be inline as well as in a block
  44. 42. [ /@(\S.*?\S)@/, :quick_escape_code ],
  45. 43. [ /"(.+?)":(\S*\w\/?)/, :textile_link ],
  46. 44. [ /\[(.+?)\]:(\S*\w\/?)/, :textile_link_alias ],
  47. 45. [ /!(.+?)!(\S*)/, :image_link ],
  48. 46. [ /([A-Z0-9]+)\((.*?)\)/, :acronym ],
  49. 47. [ /(\S+?)\[(\d+)\]/, :footnote ]
  50. 48. ]
  51. 49.
  52. 50. # Bit of a bodge, but need a different definition of phrase and whitespace in a table
  53. 51. INLINE_COMMANDS = COMMANDS + [
  54. 52. [ /\S+/i, :phrase ],
  55. 53. [ /\s+/i, :space ],
  56. 54. ]
  57. 55.
  58. 56. TABLE_INLINE_COMMANDS = COMMANDS + [
  59. 57. [ /[^\s\t\r\n\f\|]+/i, :phrase ],
  60. 58. [ /\s+/i, :space ],
  61. 59. ]
  62. 60.
  63. 61. # Then these are applied to phrases from above
  64. 62. PHRASE_MODIFIERS = [
  65. 63. [ '__', 'i' ],
  66. 64. [ '**', 'b' ],
  67. 65. [ '_', 'em' ],
  68. 66. [ '*', 'strong' ],
  69. 67. [ '??', 'cite' ],
  70. 68. [ '-', 'del' ],
  71. 69. [ '+', 'ins' ],
  72. 70. [ '^', 'sup' ],
  73. 71. [ '~', 'sub' ],
  74. 72. [ '%', 'span' ], # How to avoid when people use % as in 3.0% growth?
  75. 73. ].collect! { |regexp,tag| [ /^#{Regexp.escape(regexp)}/, /#{Regexp.escape(regexp)}$/, tag ] }
  76. 74.
  77. 75. # Character substitutions done last to any words
  78. 76. GLYPHS = [
  79. 77. [ /^\'/, '‘' ], # single opening
  80. 78. [ /^"/, '“' ], # double opening
  81. 79. [ /\'$/, '’' ], # single closing
  82. 80. [ /\"$/, '”' ], # double closing
  83. 81. [ /\.{3}$/, '\1…' ], # ellipsis
  84. 82. [ '--', '\1—' ], # em dash
  85. 83. [ '->', ' → ' ], # right arrow
  86. 84. [ '-$', ' – ' ], # en dash
  87. 85.
  88. 86. [ '(TM)', '™' ], # trademark
  89. 87. [ '(R)', '®' ], # registered
  90. 88. [ '(C)', '©' ] # copyright
  91. 89. ]
  92. 90.
  93. 91. # This is just used to give the output html a more beautiful layout
  94. 92. # All tags in here will get a newline after they are output
  95. 93. # and the indent for following lines increased by the number
  96. 94. INDENTING_TAGS = { 'ul' => 1,
  97. 95. 'ol' => 1,
  98. 96. 'li' => 0,
  99. 97. 'blockquote' => 1,
  100. 98. 'table' => 1,
  101. 99. 'tr' => 1,
  102. 100. 'td' => 0,
  103. 101. 'th' => 0,
  104. 102. 'p' => 0,
  105. 103. }
  106. 104.
  107. 105. def initialize( text )
  108. 106. @text = text
  109. 107. end
  110. 108.
  111. 109. def to_html( settings = nil )
  112. 110. reset
  113. 111. convert_text
  114. 112. return html.chomp.chomp # the tests don't have any trailing \ns
  115. 113. end
  116. 114.
  117. 115. private
  118. 116.
  119. 117. ## Methods dealing with blocks of text are called first
  120. 118.
  121. 119. def convert_text
  122. 120. until @scanner.eos?
  123. 121. send( BLOCK_COMMANDS.detect { |regexp, method| @scanner.scan( regexp ) }[1] )
  124. 122. add_to_html "\n" # Prettier html if extra space between blocks
  125. 123. end
  126. 124. insert_any_link_aliases
  127. 125. html
  128. 126. end
  129. 127.
  130. 128. # These are all the block commands
  131. 129.
  132. 130. def paragraph_block
  133. 131. tag( 'p', parse_attributes( @scanner[1] ) ) do
  134. 132. standard_paragraph
  135. 133. end
  136. 134. end
  137. 135.
  138. 136. def quote_block
  139. 137. tag( "blockquote", parse_attributes( @scanner[1] ) ) do
  140. 138. tag 'p' do
  141. 139. standard_paragraph
  142. 140. end
  143. 141. end
  144. 142. end
  145. 143.
  146. 144. def unspecified_block
  147. 145. @scanner.unscan
  148. 146. tag 'p' do
  149. 147. standard_paragraph
  150. 148. end
  151. 149. end
  152. 150.
  153. 151. def heading_block
  154. 152. tag( "h#{@scanner[1]}", parse_attributes( @scanner[2] ) ) do
  155. 153. standard_line # Assume titles may only be on one line
  156. 154. end
  157. 155. end
  158. 156.
  159. 157. def list_block
  160. 158. ordered = list_ordered? # See what sort of list we have
  161. 159. depth = list_depth
  162. 160. @scanner.unscan # So that the lines can be scanned individually
  163. 161. tag( ordered ? 'ol' : 'ul' ) do
  164. 162. list_line( ordered, depth ) until end_of_list?( depth )
  165. 163. end
  166. 164. end
  167. 165.
  168. 166. def table_block
  169. 167. tag( 'table', parse_attributes( @scanner[1] ) ) do
  170. 168. table_line until end_of_paragraph?
  171. 169. end
  172. 170. end
  173. 171.
  174. 172. def unspecified_table_block
  175. 173. @scanner.unscan
  176. 174. tag( 'table' ) do
  177. 175. table_line until end_of_paragraph?
  178. 176. end
  179. 177. end
  180. 178.
  181. 179. def footnote_block
  182. 180. number = @scanner[1]
  183. 181. attributes = parse_attributes( @scanner[2] )
  184. 182. attributes[:id] = "fn#{number}"
  185. 183. tag 'p', attributes do
  186. 184. add_to_html "<sup>#{number}</sup> "
  187. 185. standard_line until end_of_paragraph?
  188. 186. end
  189. 187. end
  190. 188.
  191. 189. # Now descend into methods dealing with lines of text
  192. 190.
  193. 191. def pre_tag
  194. 192. escape_tag 'pre'
  195. 193. end
  196. 194.
  197. 195. def code_tag
  198. 196. escape_tag 'code'
  199. 197. end
  200. 198.
  201. 199. def notextile_tag
  202. 200. escape_tag 'notextile', false
  203. 201. end
  204. 202.
  205. 203. # This escapes until a matching close tag
  206. 204. def escape_tag( tag, include_tag_in_output = true )
  207. 205. add_to_html( @scanner.matched ) if include_tag_in_output
  208. 206. level = 1
  209. 207. while level > 0
  210. 208. break unless @scanner.scan(/(.*?)(<(\/)?#{tag}.*?>)/m) # Breaks if no closing tag
  211. 209. add_to_html( htmlesc( @scanner[1] || "" ) )
  212. 210. level = level + ( @scanner[3] ? -1 : 1 )
  213. 211. add_to_html( htmlesc( @scanner[2] ) ) unless level == 0
  214. 212. end
  215. 213. add_to_html "</#{tag}>" if include_tag_in_output
  216. 214. end
  217. 215.
  218. 216. def quick_escape_code
  219. 217. tag 'code' do
  220. 218. add_to_html( htmlesc( @scanner[1] ) )
  221. 219. end
  222. 220. end
  223. 221.
  224. 222. def list_line( ordered, depth )
  225. 223. tag 'li' do
  226. 224. @scanner.scan(/(#+|\*+)\s+/)
  227. 225. if ( list_ordered? == ordered ) && ( list_depth == depth )
  228. 226. standard_line
  229. 227. else # Recursive for sub lists
  230. 228. list_block
  231. 229. end
  232. 230. end
  233. 231. end
  234. 232.
  235. 233. def table_line
  236. 234. # Are there row attributes at the start of the line?
  237. 235. attributes = @scanner.scan(/(\{.+?\})\.\s+/) ? parse_attributes(@scanner[1]) : {}
  238. 236. @scanner.scan(/\|/) # Get rid of any leading cell opening
  239. 237. tag( 'tr', attributes ) do
  240. 238. table_cell until end_of_table_line?
  241. 239. end
  242. 240. end
  243. 241.
  244. 242. def standard_paragraph
  245. 243. until end_of_paragraph?
  246. 244. send( INLINE_COMMANDS.detect { |regexp, method| @scanner.scan( regexp ) }[1] )
  247. 245. end
  248. 246. end
  249. 247.
  250. 248. def standard_line
  251. 249. until end_of_line?
  252. 250. send( INLINE_COMMANDS.detect { |regexp, method| @scanner.scan( regexp ) }[1] )
  253. 251. end
  254. 252. end
  255. 253.
  256. 254. # Now descend into methods dealing with phrases
  257. 255.
  258. 256. def table_cell
  259. 257. # Style defined at start of cell ?
  260. 258. attributes = @scanner.scan(/(_)?(\S*?)\.\s*/) ? parse_attributes(@scanner[2]) : {}
  261. 259. tag( @scanner[1] ? 'th' : 'td', attributes) do
  262. 260. until end_of_table_cell?
  263. 261. send( TABLE_INLINE_COMMANDS.detect { |regexp, method| @scanner.scan( regexp ) }[1] )
  264. 262. end
  265. 263. end
  266. 264. end
  267. 265.
  268. 266. def footnote
  269. 267. add_to_html "#{@scanner[1]}<sup><a href=\"#fn#{@scanner[2]}\">#{@scanner[2]}</a></sup>"
  270. 268. end
  271. 269.
  272. 270. def acronym
  273. 271. add_to_html "<acronym title=\"#{@scanner[2]}\">#{@scanner[1]}</acronym>"
  274. 272. end
  275. 273.
  276. 274. def phrase
  277. 275. word = @scanner.matched
  278. 276.
  279. 277. return add_to_html( parse_glyphs( word ) ) unless word =~ /\w+/ # If a word is entirely symbols then we will leave it in peace.
  280. 278.
  281. 279. # Open tags
  282. 280. PHRASE_MODIFIERS.each do |start_r, end_r, tag|
  283. 281. if word =~ start_r
  284. 282. word = $' # The bit after the match
  285. 283. # Look for matching brackets that indicate there are attributes
  286. 284. if word =~ /(\(.+?\)|\{.+?\}|\[.+?\])/
  287. 285. open_tag( tag, parse_attributes( $1 ) )
  288. 286. word = $'
  289. 287. else
  290. 288. open_tag tag
  291. 289. end
  292. 290. break
  293. 291. end
  294. 292. end
  295. 293.
  296. 294. # Close tags
  297. 295. end_tag = nil
  298. 296. PHRASE_MODIFIERS.each do |start_r, end_r, tag|
  299. 297. if word =~ end_r
  300. 298. end_tag = tag
  301. 299. word = $` # The bit before the match
  302. 300. break
  303. 301. end
  304. 302. end
  305. 303. add_to_html parse_glyphs( word )
  306. 304. close_tag( end_tag ) if end_tag
  307. 305. end
  308. 306.
  309. 307. def space
  310. 308. add_to_html @scanner.matched
  311. 309. end
  312. 310.
  313. 311. def image_link
  314. 312. @scanner.matched =~ /^!([<>]*)(.*?)(!|\((.*?)\)!)($|(:(.+?)$))/
  315. 313. alignment, src, title, url = $1, $2, $4, $7
  316. 314. attributes = {}
  317. 315. attributes[:style] = 'float:right' if alignment == '>'
  318. 316. attributes[:style] = 'float:left' if alignment == '<'
  319. 317.
  320. 318. attributes[:src] = src
  321. 319. attributes[:alt] = attributes[ :title ] = title if title
  322. 320. if url
  323. 321. tag 'a', { :href => url } do
  324. 322. open_tag 'img', attributes, true
  325. 323. end
  326. 324. else
  327. 325. open_tag 'img', attributes, true
  328. 326. end
  329. 327. end
  330. 328.
  331. 329. def skip
  332. 330. # Do nothing !
  333. 331. end
  334. 332.
  335. 333. def textile_link
  336. 334. add_to_html "<a href=\"#{@scanner[2]}\">#{@scanner[1]}</a>"
  337. 335. end
  338. 336.
  339. 337. def textile_link_alias
  340. 338. # These are saved for later resubstitution
  341. 339. @aliases[ @scanner[1] ] = @scanner[2]
  342. 340. end
  343. 341.
  344. 342. # This feels clunky, and is done last
  345. 343. def insert_any_link_aliases
  346. 344. @aliases.each do |als, href|
  347. 345. html.gsub!( /href="#{als}"/, "href=\"#{href}\"" )
  348. 346. end
  349. 347. end
  350. 348.
  351. 349. # These are helper methods that make sure html is properly closed and indented
  352. 350.
  353. 351. def tag( tag, attributes = {} )
  354. 352. open_tag( tag, attributes )
  355. 353. yield
  356. 354. close_tag tag
  357. 355. end
  358. 356.
  359. 357. def close_tag( tag = :all_tags )
  360. 358. # Check the tag has been opened
  361. 359. return unless open_tags.include?( tag ) || ( tag == :all_tags )
  362. 360.
  363. 361. # Close all tags up to that tag (in case one was not closed)
  364. 362. until open_tags.empty?
  365. 363. open_tag = open_tags.pop
  366. 364. # This is just stuff to make the html look pretty
  367. 365. if (indent = INDENTING_TAGS[ open_tag ] )
  368. 366. if indent == 0
  369. 367. add_to_html "</#{open_tag}>"
  370. 368. add_to_html "\n"
  371. 369. else
  372. 370. add_to_html "\n" unless html =~ /\n$/
  373. 371. @indent -= indent
  374. 372. add_to_html( "\t" * @indent )
  375. 373. add_to_html "</#{open_tag}>"
  376. 374. add_to_html "\n"
  377. 375. end
  378. 376. else
  379. 377. add_to_html "</#{open_tag}>"
  380. 378. end
  381. 379. return if open_tag == tag
  382. 380. end
  383. 381. end
  384. 382.
  385. 383. def open_tag( tag, attributes = {}, no_close_tag = false )
  386. 384. add_to_html( "\t" * @indent )
  387. 385.
  388. 386. add_to_html "<#{tag}"
  389. 387.
  390. 388. attributes.each { |key, value| add_to_html( " #{key.to_s}=\"#{value.to_s}\"" ) }
  391. 389.
  392. 390. if no_close_tag
  393. 391. add_to_html " />"
  394. 392. return
  395. 393. end
  396. 394.
  397. 395. add_to_html ">"
  398. 396.
  399. 397. if (indent = INDENTING_TAGS[ tag ] )
  400. 398. add_to_html "\n" unless indent == 0
  401. 399. @indent += indent
  402. 400. end
  403. 401. open_tags << tag
  404. 402. end
  405. 403.
  406. 404. def open_tags
  407. 405. @open_tags ||= []
  408. 406. end
  409. 407.
  410. 408. def parse_attributes( attribute_text )
  411. 409. return {} unless attribute_text && attribute_text != ""
  412. 410. a = { :style => "" }
  413. 411.
  414. 412. # The hand-entered classes, ids, styles and langs
  415. 413. # These are replaced with "" so their content cannot be matched below
  416. 414. a[:lang] = $1 if attribute_text =~ /\[(.+?)\]/
  417. 415. a[:class] = $1 if attribute_text.sub!(/\((.+?)\)/,'')
  418. 416. a[:class], a[:id] = $1, $2 if a[:class] =~ /^(.*?)#(.*)$/
  419. 417. a[:style] << "#{$1};" if attribute_text.sub!(/\{(.+?)\}/,'')
  420. 418.
  421. 419. # Various padding and indents
  422. 420. a[:style] << "padding-left:#{ $1.length }em;" if attribute_text =~ /(\(+)/
  423. 421. a[:style] << "padding-right:#{ $1.length }em;" if attribute_text =~ /(\)+)/
  424. 422.
  425. 423. # The various alignments
  426. 424. a[:style] << "text-align:left;" if attribute_text =~ /<(?!>)/
  427. 425. a[:style] << "text-align:right;" if attribute_text =~ /(?!<)>/
  428. 426. a[:style] << "text-align:justify;" if attribute_text =~ /<>/
  429. 427. a[:style] << "text-align:center;" if attribute_text =~ /=/
  430. 428.
  431. 429. #Various column spans on tables
  432. 430. a[:colspan] = $1 if attribute_text =~ /\\(\d+)/
  433. 431. a[:rowspan] = $1 if attribute_text =~ /\/(\d+)/
  434. 432.
  435. 433. #Vertical alignments on tables
  436. 434. a[:style] << "vertical-align:top;" if attribute_text =~ /\^/
  437. 435. a[:style] << "vertical-align:bottom;" if attribute_text =~ /\~/
  438. 436.
  439. 437. # Get rid of any empty attributes before returning
  440. 438. a.delete_if { |k,v| !v || (v == "") }
  441. 439. end
  442. 440.
  443. 441. def parse_glyphs( word )
  444. 442. GLYPHS.each do |regexp,replacement|
  445. 443. word.gsub!( regexp, replacement )
  446. 444. end
  447. 445. word
  448. 446. end
  449. 447.
  450. 448. # Now some helper methods for spotting the ends of sections
  451. 449.
  452. 450. def end_of_paragraph?
  453. 451. return true if @scanner.eos?
  454. 452. @scanner.scan(/\n{2,}/)
  455. 453. end
  456. 454.
  457. 455. def end_of_list?( depth )
  458. 456. return true if @scanner.eos?
  459. 457. return true unless @scanner.check(/(#+|\*+)\s+/) # Not a list any more
  460. 458. return true if list_depth < depth # End of this sub list
  461. 459. @scanner.scan(/\n{2,}/)
  462. 460. end
  463. 461.
  464. 462. def end_of_line?
  465. 463. return true if @scanner.eos?
  466. 464. return true if @scanner.check(/\n{2,}/)
  467. 465. @scanner.scan(/\n/)
  468. 466. end
  469. 467.
  470. 468. def end_of_table_line?
  471. 469. return true if @scanner.eos?
  472. 470. return true if @scanner.check(/\n{2,}/)
  473. 471. @scanner.scan(/\|\s*\n/)
  474. 472. end
  475. 473.
  476. 474. def end_of_table_cell?
  477. 475. return true if @scanner.eos?
  478. 476. return true if @scanner.check(/\n/)
  479. 477. return true if @scanner.check(/\|\s*\n/)
  480. 478. @scanner.scan(/\|/)
  481. 479. end
  482. 480.
  483. 481. # Now some random helper methods for decoding
  484. 482.
  485. 483. def list_ordered?
  486. 484. @scanner.matched[0,1] == '#'
  487. 485. end
  488. 486.
  489. 487. def list_depth
  490. 488. @scanner[1].size
  491. 489. end
  492. 490.
  493. 491. def htmlesc( str )
  494. 492. str.gsub!( '&', '&amp;' )
  495. 493. str.gsub!( '"', '&quot;' )
  496. 494. str.gsub!( '<', '&lt;')
  497. 495. str.gsub!( '>', '&gt;')
  498. 496. str
  499. 497. end
  500. 498.
  501. 499. # Now the low level matching functions
  502. 500.
  503. 501. def add_to_html( object )
  504. 502. @html << object.to_s
  505. 503. end
  506. 504.
  507. 505. def html
  508. 506. @html
  509. 507. end
  510. 508.
  511. 509. def reset
  512. 510. @html, @scanner = "" , StringScanner.new( @text )
  513. 511. @aliases = {}
  514. 512. @indent = 0
  515. 513. end
  516. 514. end
  517. 515. </code>
  518. 516. </pre>

04:43 Tue 09 Aug 2005

  1. 0. content moved from [[A re-write of Redcloth textile to html convertor]]
View, Edit or see all changes to this page.