|
@@ -171,87 +171,7 @@ module Utils
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
|
|
- module HTMLTransformer
|
|
|
|
- SINGLE = 1
|
|
|
|
- MULTIPLE = 2
|
|
|
|
- COMMA_SEPARATED = 3
|
|
|
|
- SRCSET = 4
|
|
|
|
|
|
|
|
- URI_ATTRIBUTES = {
|
|
|
|
- 'a' => { 'href' => SINGLE },
|
|
|
|
- 'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
|
|
|
|
- 'area' => { 'href' => SINGLE },
|
|
|
|
- 'audio' => { 'src' => SINGLE },
|
|
|
|
- 'base' => { 'href' => SINGLE },
|
|
|
|
- 'blockquote' => { 'cite' => SINGLE },
|
|
|
|
- 'body' => { 'background' => SINGLE },
|
|
|
|
- 'button' => { 'formaction' => SINGLE },
|
|
|
|
- 'command' => { 'icon' => SINGLE },
|
|
|
|
- 'del' => { 'cite' => SINGLE },
|
|
|
|
- 'embed' => { 'src' => SINGLE },
|
|
|
|
- 'form' => { 'action' => SINGLE },
|
|
|
|
- 'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
|
|
|
|
- 'head' => { 'profile' => SINGLE },
|
|
|
|
- 'html' => { 'manifest' => SINGLE },
|
|
|
|
- 'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
|
|
|
|
- 'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
|
|
|
|
- 'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
|
|
|
|
- 'ins' => { 'cite' => SINGLE },
|
|
|
|
- 'link' => { 'href' => SINGLE },
|
|
|
|
- 'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
|
|
|
|
- 'q' => { 'cite' => SINGLE },
|
|
|
|
- 'script' => { 'src' => SINGLE },
|
|
|
|
- 'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
|
|
|
|
- 'video' => { 'poster' => SINGLE, 'src' => SINGLE },
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')
|
|
|
|
-
|
|
|
|
- module_function
|
|
|
|
-
|
|
|
|
- def transform(html, &block)
|
|
|
|
- block or raise ArgumentError, 'block must be given'
|
|
|
|
-
|
|
|
|
- case html
|
|
|
|
- when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
|
|
|
|
- doc = Nokogiri.parse(html)
|
|
|
|
- yield doc
|
|
|
|
- doc.to_s
|
|
|
|
- when /\A\s*<(html|head|body)[\s>]/i
|
|
|
|
- # Libxml2 automatically adds DOCTYPE and <html>, so we need to
|
|
|
|
- # skip them.
|
|
|
|
- element_name = $1
|
|
|
|
- doc = Nokogiri::HTML::Document.parse(html)
|
|
|
|
- yield doc
|
|
|
|
- doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
|
|
|
|
- else
|
|
|
|
- doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
|
|
|
|
- yield doc
|
|
|
|
- doc.xpath("/html/body/node()").to_s
|
|
|
|
- end
|
|
|
|
- end
|
|
|
|
-
|
|
|
|
- def replace_uris(html, &block)
|
|
|
|
- block or raise ArgumentError, 'block must be given'
|
|
|
|
-
|
|
|
|
- transform(html) { |doc|
|
|
|
|
- doc.xpath(URI_ELEMENTS_XPATH).each { |element|
|
|
|
|
- uri_attrs = URI_ATTRIBUTES[element.name] or next
|
|
|
|
- uri_attrs.each { |name, format|
|
|
|
|
- attr = element.attribute(name) or next
|
|
|
|
- case format
|
|
|
|
- when SINGLE
|
|
|
|
- attr.value = block.call(attr.value.strip)
|
|
|
|
- when MULTIPLE
|
|
|
|
- attr.value = attr.value.gsub(/(\S+)/) { block.call($1) }
|
|
|
|
- when COMMA_SEPARATED, SRCSET
|
|
|
|
- attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) { $1 + block.call($2) }
|
|
|
|
- end
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- end
|
|
|
|
- end
|
|
|
|
|
|
|
|
def self.rebase_hrefs(html, base_uri)
|
|
def self.rebase_hrefs(html, base_uri)
|
|
base_uri = normalize_uri(base_uri)
|
|
base_uri = normalize_uri(base_uri)
|