# html_transformer.rb (2.9 KB)
  1. module Utils
  2. module HtmlTransformer
  3. SINGLE = 1
  4. MULTIPLE = 2
  5. COMMA_SEPARATED = 3
  6. SRCSET = 4
  7. URI_ATTRIBUTES = {
  8. 'a' => { 'href' => SINGLE },
  9. 'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
  10. 'area' => { 'href' => SINGLE },
  11. 'audio' => { 'src' => SINGLE },
  12. 'base' => { 'href' => SINGLE },
  13. 'blockquote' => { 'cite' => SINGLE },
  14. 'body' => { 'background' => SINGLE },
  15. 'button' => { 'formaction' => SINGLE },
  16. 'command' => { 'icon' => SINGLE },
  17. 'del' => { 'cite' => SINGLE },
  18. 'embed' => { 'src' => SINGLE },
  19. 'form' => { 'action' => SINGLE },
  20. 'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
  21. 'head' => { 'profile' => SINGLE },
  22. 'html' => { 'manifest' => SINGLE },
  23. 'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
  24. 'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
  25. 'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
  26. 'ins' => { 'cite' => SINGLE },
  27. 'link' => { 'href' => SINGLE },
  28. 'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
  29. 'q' => { 'cite' => SINGLE },
  30. 'script' => { 'src' => SINGLE },
  31. 'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
  32. 'video' => { 'poster' => SINGLE, 'src' => SINGLE },
  33. }
  34. URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')
  35. module_function
  36. def transform(html, &block)
  37. block or raise ArgumentError, 'block must be given'
  38. case html
  39. when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
  40. doc = Nokogiri.parse(html)
  41. yield doc
  42. doc.to_s
  43. when /\A\s*<(html|head|body)[\s>]/i
  44. # Libxml2 automatically adds DOCTYPE and <html>, so we need to
  45. # skip them.
  46. element_name = $1
  47. doc = Nokogiri::HTML::Document.parse(html)
  48. yield doc
  49. doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
  50. else
  51. doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
  52. yield doc
  53. doc.xpath("/html/body/node()").to_s
  54. end
  55. end
  56. def replace_uris(html, &block)
  57. block or raise ArgumentError, 'block must be given'
  58. transform(html) { |doc|
  59. doc.xpath(URI_ELEMENTS_XPATH).each { |element|
  60. uri_attrs = URI_ATTRIBUTES[element.name] or next
  61. uri_attrs.each { |name, format|
  62. attr = element.attribute(name) or next
  63. case format
  64. when SINGLE
  65. attr.value = block.call(attr.value.strip)
  66. when MULTIPLE
  67. attr.value = attr.value.gsub(/(\S+)/) { block.call($1) }
  68. when COMMA_SEPARATED, SRCSET
  69. attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) { $1 + block.call($2) }
  70. end
  71. }
  72. }
  73. }
  74. end
  75. end
  76. end