david
/
huginn
mirror of https://github.com/huginn/huginn.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
							require 'jsonpath'
require 'cgi'
require 'addressable/uri'

module Utils
  # Strips the indentation of the first non-blank line from every line of +s+.
  #
  # Tabs are first expanded to two spaces and one trailing line break is
  # removed.  The amount to strip is the leading whitespace of the first line
  # that contains a non-whitespace character; lines that do not start with that
  # many spaces are left untouched.
  #
  # When the text is empty or contains only blank lines there is no reference
  # line, and the tab-expanded, chomped text is returned as is.
  def self.unindent(s)
    s = s.gsub(/\t/, '  ').chomp
    first_line = s.split("\n").find { |l| l !~ /^\s*$/ }
    # Without a non-blank line, indexing the result of `find` would fail on nil.
    return s if first_line.nil?

    min = (first_line[/^\s+/, 0] || "").length
    if min > 0
      s.gsub(/^#{" " * min}/, "")
    else
      s
    end
  end

  # Renders +struct+ as pretty-printed JSON.
  #
  # When +indent+ is truthy (the default), four spaces are inserted after every
  # line break, so every line except the first is shifted right by four
  # columns.  Otherwise the generated JSON is returned as is.
  def self.pretty_print(struct, indent = true)
    json = JSON.pretty_generate(struct)
    return json unless indent

    json.gsub(/\n/i, "\n    ")
  end

  # Converts +uri+ into a URI object, escaping it first when it does not parse
  # as given.
  #
  # Up to three attempts are made, each only if the previous one raised
  # URI::Error:
  #
  # 1. Parse +uri+ unchanged.
  # 2. Percent-encode every run of characters outside the set listed in the
  #    pattern below, one byte at a time, and parse the result.
  # 3. Parse with Addressable::URI, replace the site and path with their
  #    normalized forms, and parse the resulting string.
  #
  # If Addressable cannot parse the value either, the URI::Error from the
  # second attempt is re-raised.
  def self.normalize_uri(uri)
    begin
      # First attempt: the value exactly as given.
      URI(uri)
    rescue URI::Error
      begin
        # Second attempt: each offending byte becomes %XX (uppercase hex), and
        # the escaped string is tagged as US-ASCII before parsing.
        URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
              unsafe.bytes.each_with_object(String.new) { |uc, s|
                s << sprintf('%%%02X', uc)
              }
            }.force_encoding(Encoding::US_ASCII))
      rescue URI::Error => e
        begin
          # Third attempt: hand the original string to Addressable.
          auri = Addressable::URI.parse(uri.to_s)
        rescue
          # Do not leak Addressable::URI::InvalidURIError which
          # callers might not expect.
          raise e
        else
          # Addressable::URI#normalize! modifies the query and
          # fragment components beyond escaping unsafe characters, so
          # avoid using it.  Otherwise `?a[]=%2F` would be normalized
          # as `?a%5B%5D=/`, for example.
          auri.site = auri.normalized_site
          auri.path = auri.normalized_path
          URI(auri.to_s)
        end
      end
    end
  end

  # Substitutes JSONPath lookups against +data+ into the string +value+.
  #
  # When options[:leading_dollarsign_is_jsonpath] is truthy and +value+ starts
  # with "$", the whole value is treated as one path and the string form of its
  # first match is returned.  Otherwise every <...> segment is replaced by the
  # string form of the first match for the text between the angle brackets.
  def self.interpolate_jsonpaths(value, data, options = {})
    whole_value_is_path = options[:leading_dollarsign_is_jsonpath] && value[0] == '$'
    return Utils.values_at(data, value).first.to_s if whole_value_is_path

    value.gsub(/<[^>]+>/) do |placeholder|
      # Drop the surrounding "<" and ">" to get the path itself.
      Utils.values_at(data, placeholder[1..-2]).first.to_s
    end
  end

  # Walks +struct+ and runs interpolate_jsonpaths on every String found in it.
  #
  # Hashes are rebuilt as new Hashes with the same keys and processed values,
  # Arrays are mapped element by element, and any other object is returned
  # unchanged.
  def self.recursively_interpolate_jsonpaths(struct, data, options = {})
    case struct
    when String
      interpolate_jsonpaths(struct, data, options)
    when Array
      struct.map { |item| recursively_interpolate_jsonpaths(item, data, options) }
    when Hash
      struct.each_with_object({}) do |(name, nested), result|
        result[name] = recursively_interpolate_jsonpaths(nested, data, options)
      end
    else
      struct
    end
  end

  # Returns the first result of values_at for +path+ in +data+.
  def self.value_at(data, path)
    matches = values_at(data, path)
    matches.first
  end

  # Evaluates the JSONPath expression +path+ against +data+ and returns the
  # matches.
  #
  # +data+ is passed to JsonPath unchanged when it is a String and is
  # serialized with to_json otherwise.  A path that begins with "escape " has
  # that prefix removed before evaluation, and every match is then converted
  # to a string and run through CGI.escape.
  #
  # The +path+ argument itself is never modified.
  def self.values_at(data, path)
    escape = path.start_with?('escape ')
    # Build a new string instead of stripping the prefix in place: the caller
    # may reuse (or have frozen) the string it passed in.
    path = path.sub(/\Aescape /, '') if escape

    result = JsonPath.new(path).on(data.is_a?(String) ? data : data.to_json)
    return result unless escape

    # Matches may be numbers, nil, etc.; CGI.escape only accepts strings.
    result.map { |r| CGI.escape(r.to_s) }
  end

  # Serializes +thing+ to JSON that can be embedded in an HTML page.  A plain
  # to_json may contain </script>, which can break the page and allow XSS
  # attacks, so every "</" is emitted as "<\/".
  # The result is marked html_safe unless `:skip_safe => true` is passed.
  def self.jsonify(thing, options = {})
    escaped = thing.to_json.gsub('</', '<\/')
    options[:skip_safe] ? escaped : escaped.html_safe
  end

  # Pretty-prints +thing+ as JSON and rewrites every "</" as "<\/".
  def self.pretty_jsonify(thing)
    formatted = JSON.pretty_generate(thing)
    formatted.gsub('</', '<\/')
  end

  # Sorts an array of tuples (arrays) position by position, with an optional
  # descending flag per position.
  class TupleSorter
    # Wraps one tuple so that it can be used as a sort key.
    class SortableTuple
      attr_reader :array

      # +orders+ is indexed by element position: a truthy orders[n] makes the
      # nth element compare in descending order.
      def initialize(array, orders = [])
        @array = array
        @orders = orders
      end

      # Compares the receiver's elements with the other tuple's, in order, and
      # returns at the first position that differs.  When <=> cannot compare
      # two elements (it returns nil), their to_s forms are compared instead.
      # Returns 0 when no position of the receiver differs.
      def <=>(other)
        theirs = other.array
        @array.each_with_index do |mine, index|
          counterpart = theirs[index]
          result = (mine <=> counterpart) || (mine.to_s <=> counterpart.to_s)
          next if result == 0

          return @orders[index] ? -result : result
        end
        0
      end
    end

    # Sorts +array+ in place using SortableTuple keys built with +orders+.
    def self.sort!(array, orders = [])
      array.sort_by! { |tuple| SortableTuple.new(tuple, orders) }
    end
  end

  # Sorts +array+ (an array of tuples) in place, keyed by
  # TupleSorter::SortableTuple.  A truthy orders[n] makes the nth tuple element
  # sort in descending order.
  def self.sort_tuples!(array, orders = [])
    array.sort_by! do |tuple|
      TupleSorter::SortableTuple.new(tuple, orders)
    end
  end

  # Unit names accepted after the dot in a "<number>.<unit>" duration.
  DURATION_UNIT_METHODS = %w[
    second seconds minute minutes hour hours day days
    week weeks fortnight fortnights month months year years
  ].freeze

  # Parses a duration written either as a bare number ("45") or as
  # "<number>.<unit>" ("2.days").
  #
  # Returns nil for blank input.  A bare number is returned as an Integer.  In
  # the unit form, the unit is called as a method on the Integer and its result
  # is returned (presumably ActiveSupport's duration helpers -- they are not
  # defined in this file).  Anything else writes a warning to STDERR and
  # returns nil.
  #
  # Only the names in DURATION_UNIT_METHODS are accepted as units, and they are
  # invoked with public_send.  The unit comes straight from the input string,
  # so calling it unchecked with `send` would let input such as "1.exit" or
  # "1.sleep" run arbitrary (even private) zero-argument methods.
  def self.parse_duration(string)
    return nil if string.blank?

    text = string.strip
    unit_form = /\A(\d+)\.(\w+)\z/.match(text)
    if unit_form && DURATION_UNIT_METHODS.include?(unit_form[2])
      unit_form[1].to_i.public_send(unit_form[2])
    elsif text =~ /\A\d+\z/
      text.to_i
    else
      STDERR.puts "WARNING: Invalid duration format: '#{text}'"
      nil
    end
  end

  # Calls +method+ on +string+ and returns the result, but only when
  # string.present? is truthy; returns nil otherwise.
  def self.if_present(string, method)
    return nil unless string.present?

    string.send(method)
  end

  # Helpers for rewriting the URIs found in the attributes of an HTML (or XML)
  # document.
  module HTMLTransformer
    # Formats a URI-bearing attribute value can have.
    SINGLE = 1           # one URI
    MULTIPLE = 2         # whitespace-separated URIs
    COMMA_SEPARATED = 3  # comma-separated URIs
    SRCSET = 4           # comma-separated "URI [descriptor]" candidates

    # Element name => { attribute name => value format }.
    URI_ATTRIBUTES = {
      'a' => { 'href' => SINGLE },
      'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
      'area' => { 'href' => SINGLE },
      'audio' => { 'src' => SINGLE },
      'base' => { 'href' => SINGLE },
      'blockquote' => { 'cite' => SINGLE },
      'body' => { 'background' => SINGLE },
      'button' => { 'formaction' => SINGLE },
      'command' => { 'icon' => SINGLE },
      'del' => { 'cite' => SINGLE },
      'embed' => { 'src' => SINGLE },
      'form' => { 'action' => SINGLE },
      'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
      'head' => { 'profile' => SINGLE },
      'html' => { 'manifest' => SINGLE },
      'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
      'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
      'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
      'ins' => { 'cite' => SINGLE },
      'link' => { 'href' => SINGLE },
      'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
      'q' => { 'cite' => SINGLE },
      'script' => { 'src' => SINGLE },
      'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
      'video' => { 'poster' => SINGLE, 'src' => SINGLE },
    }

    # XPath selecting every element whose name is a key of URI_ATTRIBUTES.
    URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')

    module_function

    # Parses +html+, yields the parsed document so the block can modify it, and
    # returns the result serialized back to a string.
    #
    # Input that starts with an XML declaration or a DOCTYPE is parsed with
    # Nokogiri.parse and serialized whole.  Input that starts with an <html>,
    # <head> or <body> tag is serialized from that element onwards.  Any other
    # input is treated as a fragment: it is wrapped in <html><body> for parsing
    # and only the children of <body> are returned.
    #
    # Raises ArgumentError when no block is given.
    def transform(html, &block)
      block or raise ArgumentError, 'block must be given'

      case html
      when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
        doc = Nokogiri.parse(html)
        yield doc
        doc.to_s
      when /\A\s*<(html|head|body)[\s>]/i
        # Libxml2 automatically adds DOCTYPE and <html>, so we need to
        # skip them.
        #
        # The tag is matched case-insensitively, but XPath name tests are
        # case-sensitive and the HTML parser reports element names in lower
        # case, so look the element up by its lower-cased name.
        element_name = $1.downcase
        doc = Nokogiri::HTML::Document.parse(html)
        yield doc
        doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
      else
        doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
        yield doc
        doc.xpath("/html/body/node()").to_s
      end
    end

    # Returns +html+ with every URI in the attributes listed in URI_ATTRIBUTES
    # replaced by the value the block returns for it.
    #
    # Raises ArgumentError when no block is given.
    def replace_uris(html, &block)
      block or raise ArgumentError, 'block must be given'

      transform(html) { |doc|
        doc.xpath(URI_ELEMENTS_XPATH).each { |element|
          uri_attrs = URI_ATTRIBUTES[element.name] or next
          uri_attrs.each { |name, format|
            attr = element.attribute(name) or next
            attr.value = replace_uris_in_value(attr.value, format, &block)
          }
        }
      }
    end

    # Internal helper for replace_uris: returns the attribute text +value+ with
    # each URI in it replaced by the block's result, according to +format+.
    # Separators and srcset descriptors are preserved.
    def replace_uris_in_value(value, format, &block)
      case format
      when SINGLE
        block.call(value.strip)
      when MULTIPLE
        value.gsub(/(\S+)/) { block.call($1) }
      when COMMA_SEPARATED
        # A URI ends at the next comma or whitespace.  Matching \S+ here would
        # swallow the separating comma and hide the URI that follows it.
        value.gsub(/((?:\A|,)\s*)([^\s,]+)/) { $1 + block.call($2) }
      when SRCSET
        # A srcset URL may contain commas but does not end with one, so take
        # the non-whitespace run up to its last non-comma character.  Only text
        # at the start or right after a comma is a URL; descriptors ("2x",
        # "480w") follow whitespace and are left alone.
        value.gsub(/((?:\A|,)\s*)(\S*[^\s,])/) { $1 + block.call($2) }
      else
        value
      end
    end
  end

  # Returns +html+ with every URI that HTMLTransformer.replace_uris visits
  # resolved against +base_uri+.  Both the base and each URI are passed through
  # normalize_uri before being merged.
  def self.rebase_hrefs(html, base_uri)
    base = normalize_uri(base_uri)
    HTMLTransformer.replace_uris(html) do |url|
      absolute = base.merge(normalize_uri(url))
      absolute.to_s
    end
  end
end