1
0

utils.rb 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. require 'jsonpath'
  2. require 'cgi'
  3. require 'addressable/uri'
  4. module Utils
  # Strips the common leading indentation from a block of text.
  #
  # Every tab is first replaced by a single space and one trailing newline is
  # removed.  The indentation width is the amount of leading whitespace on the
  # first line that is not blank; that many leading spaces are then removed
  # from each line that starts with them.
  #
  # s - the String to unindent.
  #
  # Returns the unindented String.  Text that has no non-blank line at all
  # (including the empty string) is returned after the tab/newline
  # normalisation only, rather than raising NoMethodError on nil.
  def self.unindent(s)
    s = s.gsub(/\t/, ' ').chomp
    first_line = s.split("\n").find { |l| l !~ /^\s*$/ }
    # Nothing but blank lines: there is no indentation to measure.
    return s unless first_line

    min = (first_line[/^\s+/, 0] || "").length
    if min > 0
      s.gsub(/^#{" " * min}/, "")
    else
      s
    end
  end
  # Renders +struct+ as pretty-printed JSON.
  #
  # struct - the object handed to JSON.pretty_generate.
  # indent - when truthy (the default), a single space is inserted after
  #          every newline of the generated text.
  #
  # Returns the JSON String.
  def self.pretty_print(struct, indent = true)
    json = JSON.pretty_generate(struct)
    return json unless indent

    json.gsub("\n", "\n ")
  end
  # Converts +uri+ into a URI object, trying progressively more lenient
  # strategies:
  #
  # 1. Parse it as is with Kernel#URI.
  # 2. On URI::Error, percent-encode every run of characters outside the
  #    allowed set below (byte by byte) and parse the result.
  # 3. On a second URI::Error, parse with Addressable::URI, normalise only
  #    the site and path components, and parse the result.
  #
  # Raises the URI::Error from step 2 when Addressable cannot parse the value
  # either, so callers never see Addressable's own error classes.
  def self.normalize_uri(uri)
    URI(uri)
  rescue URI::Error
    begin
      escaped = uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) do |unsafe|
        unsafe.bytes.each_with_object(String.new) do |byte, encoded|
          encoded << sprintf('%%%02X', byte)
        end
      end
      URI(escaped.force_encoding(Encoding::US_ASCII))
    rescue URI::Error => e
      auri =
        begin
          Addressable::URI.parse(uri.to_s)
        rescue
          # Do not leak Addressable::URI::InvalidURIError which
          # callers might not expect.
          raise e
        end
      # Addressable::URI#normalize! modifies the query and
      # fragment components beyond escaping unsafe characters, so
      # avoid using it.  Otherwise `?a[]=%2F` would be normalized
      # as `?a%5B%5D=/`, for example.
      auri.site = auri.normalized_site
      auri.path = auri.normalized_path
      URI(auri.to_s)
    end
  end
  # Substitutes JSONPath lookups into +value+.
  #
  # value   - the template String.
  # data    - the data passed on to Utils.values_at.
  # options - Hash; when :leading_dollarsign_is_jsonpath is truthy and
  #           +value+ starts with '$', the whole value is treated as a single
  #           path.
  #
  # Otherwise every "<...>" section of +value+ is replaced by the first match
  # for the path between the angle brackets, converted with to_s.
  #
  # Returns a String.
  def self.interpolate_jsonpaths(value, data, options = {})
    whole_value_is_path = options[:leading_dollarsign_is_jsonpath] && value[0] == '$'
    return Utils.values_at(data, value).first.to_s if whole_value_is_path

    value.gsub(/<[^>]+>/) do |placeholder|
      # Drop the surrounding "<" and ">" to get the path itself.
      Utils.values_at(data, placeholder[1..-2]).first.to_s
    end
  end
  # Walks +struct+ and applies interpolate_jsonpaths to every String found
  # inside it.
  #
  # Hashes are rebuilt with the same keys and interpolated values, Arrays are
  # mapped element by element, and anything that is not a Hash, Array or
  # String is returned untouched.
  def self.recursively_interpolate_jsonpaths(struct, data, options = {})
    case struct
    when String
      interpolate_jsonpaths(struct, data, options)
    when Array
      struct.map { |item| recursively_interpolate_jsonpaths(item, data, options) }
    when Hash
      struct.each_with_object({}) do |(key, item), interpolated|
        interpolated[key] = recursively_interpolate_jsonpaths(item, data, options)
      end
    else
      struct
    end
  end
  # Returns the first JSONPath match for +path+ in +data+ (see values_at).
  def self.value_at(data, path)
    values_at(data, path).first
  end

  # Evaluates the JSONPath expression +path+ against +data+.
  #
  # data - a String, passed to JsonPath as is, or any other object, which is
  #        serialised with to_json first.
  # path - the JSONPath expression.  A leading "escape " prefix is removed
  #        from the expression and makes every match come back as a
  #        CGI-escaped String.
  #
  # The caller's +path+ is never modified, so frozen strings are accepted.
  # In escape mode each match is converted with to_s before escaping, so
  # non-String matches such as numbers do not raise.
  #
  # Returns the Array of matches.
  def self.values_at(data, path)
    escape = /\Aescape /.match?(path)
    # Work on a copy: the original used gsub! and changed the argument.
    path = path.sub(/\Aescape /, '') if escape

    document = data.is_a?(String) ? data : data.to_json
    result = JsonPath.new(path).on(document)
    if escape
      result.map { |r| CGI.escape(r.to_s) }
    else
      result
    end
  end
  # Output JSON that is ready for inclusion into HTML.  If you simply use
  # to_json on an object, the presence of </script> in the valid JSON can
  # break the page and allow XSS attacks, so every "</" is rewritten as "<\/".
  #
  # thing   - the object to serialise with to_json.
  # options - Hash; pass `:skip_safe => true` to not call html_safe on the
  #           output.
  #
  # Returns the escaped JSON String.
  def self.jsonify(thing, options = {})
    escaped = thing.to_json.gsub('</', '<\/')
    options[:skip_safe] ? escaped : escaped.html_safe
  end
  # Pretty-printed counterpart of jsonify: generates +thing+ with
  # JSON.pretty_generate and rewrites every "</" as "<\/".
  #
  # Returns the escaped JSON String.
  def self.pretty_jsonify(thing)
    pretty = JSON.pretty_generate(thing)
    pretty.gsub('</', '<\/')
  end
  # Sorts arrays of tuples (arrays) element by element, with an optional
  # per-column descending flag.
  class TupleSorter
    # Wraps one tuple so that it can be used as a sort key.
    class SortableTuple
      attr_reader :array

      # array  - the tuple being wrapped.
      # orders - indexable by column; <=> reads orders[n] to decide whether
      #          the nth element is compared in descending order.
      def initialize(array, orders = [])
        @array = array
        @orders = orders
      end

      # Compares column by column and returns at the first column that
      # differs.  Elements whose own <=> yields nil are compared through
      # their to_s forms instead.  Returns 0 when no column of this tuple
      # differs.
      def <=>(other)
        theirs = other.array
        @array.each_with_index do |mine, index|
          their = theirs[index]
          result = (mine <=> their) || (mine.to_s <=> their.to_s)
          next if result == 0

          return @orders[index] ? -result : result
        end
        0
      end
    end

    # Sorts +array+ in place using SortableTuple keys and returns it.
    def self.sort!(array, orders = [])
      array.sort_by! { |tuple| SortableTuple.new(tuple, orders) }
    end
  end

  # Module-level entry point for TupleSorter.sort!.
  def self.sort_tuples!(array, orders = [])
    TupleSorter.sort!(array, orders)
  end
  # Parses a duration given as text.
  #
  # string - either a bare integer ("3600") or "<integer>.<unit>" such as
  #          "2.hours".  Surrounding whitespace is ignored.
  #
  # Only duration unit names (second, minute, hour, day, week, fortnight,
  # month, year, singular or plural) are accepted, and the unit is invoked
  # with public_send.  The previous implementation passed any word to
  # Integer#send, which also reaches private methods, so input such as
  # "1.exit" would terminate the process.
  #
  # Returns nil for blank input; the Integer for a bare number; whatever the
  # unit method returns for the "<integer>.<unit>" form; and nil, after
  # writing a warning to STDERR, for anything else (including unknown units).
  def self.parse_duration(string)
    return nil if string.blank?

    text = string.strip
    case text
    when /\A(\d+)\.((?:second|minute|hour|day|week|fortnight|month|year)s?)\z/
      $1.to_i.public_send($2)
    when /\A(\d+)\z/
      $1.to_i
    else
      STDERR.puts "WARNING: Invalid duration format: '#{text}'"
      nil
    end
  end
  # Calls +method+ on +string+ when string.present? is truthy.
  #
  # Returns the result of the call, or nil when the value is not present.
  def self.if_present(string, method)
    return nil unless string.present?

    string.send(method)
  end
  # Helpers for parsing a piece of markup, letting a block modify the parsed
  # document, and serialising it back; plus a URI rewriter built on top.
  module HTMLTransformer
    # Formats of URI-valued attributes; they select how replace_uris splits
    # an attribute value before handing each piece to the block.
    SINGLE = 1
    MULTIPLE = 2
    COMMA_SEPARATED = 3
    SRCSET = 4

    # Element name => { attribute name => format } for the attributes that
    # replace_uris rewrites.
    URI_ATTRIBUTES = {
      'a' => { 'href' => SINGLE },
      'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
      'area' => { 'href' => SINGLE },
      'audio' => { 'src' => SINGLE },
      'base' => { 'href' => SINGLE },
      'blockquote' => { 'cite' => SINGLE },
      'body' => { 'background' => SINGLE },
      'button' => { 'formaction' => SINGLE },
      'command' => { 'icon' => SINGLE },
      'del' => { 'cite' => SINGLE },
      'embed' => { 'src' => SINGLE },
      'form' => { 'action' => SINGLE },
      'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
      'head' => { 'profile' => SINGLE },
      'html' => { 'manifest' => SINGLE },
      'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
      'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
      'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
      'ins' => { 'cite' => SINGLE },
      'link' => { 'href' => SINGLE },
      'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
      'q' => { 'cite' => SINGLE },
      'script' => { 'src' => SINGLE },
      'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
      'video' => { 'poster' => SINGLE, 'src' => SINGLE },
    }

    # XPath expression selecting every element whose name is a key of
    # URI_ATTRIBUTES.
    URI_ELEMENTS_XPATH = "//*[#{URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')}]"

    module_function

    # Parses +html+ with Nokogiri, passes the document to the block so that
    # it can be modified, and returns the serialised result.
    #
    # The serialisation depends on how the input starts:
    # * an XML declaration or DOCTYPE: the whole document;
    # * an <html>, <head> or <body> tag: that element and the nodes following
    #   it;
    # * anything else: treated as body content, and only the body's children
    #   are returned.
    #
    # Raises ArgumentError when no block is given.
    def transform(html, &block)
      raise ArgumentError, 'block must be given' unless block

      case html
      when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
        doc = Nokogiri.parse(html)
        block.call(doc)
        doc.to_s
      when /\A\s*<(html|head|body)[\s>]/i
        # Libxml2 automatically adds DOCTYPE and <html>, so we need to
        # skip them.
        root_name = Regexp.last_match(1)
        doc = Nokogiri::HTML::Document.parse(html)
        block.call(doc)
        root = doc.at_xpath("//#{root_name}")
        root.xpath('self::node() | following-sibling::node()').to_s
      else
        doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
        block.call(doc)
        doc.xpath("/html/body/node()").to_s
      end
    end

    # Replaces every URI found in the attributes listed in URI_ATTRIBUTES
    # with the value the block returns for it, and returns the transformed
    # markup (see transform).
    #
    # Raises ArgumentError when no block is given.
    def replace_uris(html, &block)
      raise ArgumentError, 'block must be given' unless block

      transform(html) do |doc|
        doc.xpath(URI_ELEMENTS_XPATH).each do |element|
          formats = URI_ATTRIBUTES[element.name]
          next unless formats

          formats.each do |attr_name, format|
            attr = element.attribute(attr_name)
            next unless attr

            case format
            when SINGLE
              attr.value = block.call(attr.value.strip)
            when MULTIPLE
              # Whitespace-separated list: every token is a URI.
              attr.value = attr.value.gsub(/(\S+)/) { block.call(Regexp.last_match(1)) }
            when COMMA_SEPARATED, SRCSET
              # Only the first token of each comma-delimited entry is passed
              # to the block; the separator before it is kept as is.
              # NOTE(review): `\S+` also matches commas, so entries written
              # without whitespace after the comma reach the block as one
              # value -- confirm this is intended.
              attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) do
                found = Regexp.last_match
                found[1] + block.call(found[2])
              end
            end
          end
        end
      end
    end
  end
  # Resolves every URI that HTMLTransformer.replace_uris finds in +html+
  # against +base_uri+.
  #
  # Both the base and each URI are passed through normalize_uri first; the
  # merged URI is written back as a String.
  #
  # Returns the rewritten markup.
  def self.rebase_hrefs(html, base_uri)
    base = normalize_uri(base_uri)
    HTMLTransformer.replace_uris(html) do |url|
      base.merge(normalize_uri(url)).to_s
    end
  end
  236. end