# website_agent.rb

require 'nokogiri'
require 'typhoeus'
require 'date'

module Agents
  class WebsiteAgent < Agent
    default_schedule "every_12h"

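    # Bounds on how many past events are compared for uniqueness when `mode`
    # is `on_change`: at least UNIQUENESS_LOOK_BACK events, or UNIQUENESS_FACTOR
    # times the number of results in the latest scrape, whichever is larger.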
    UNIQUENESS_LOOK_BACK = 200
    UNIQUENESS_FACTOR = 3

    description <<-MD
      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.

      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.

      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).
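
      For instance, with placeholder URLs:

          'url': ["http://example.com/page-one", "http://example.com/page-two"]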

      The `type` value can be `xml`, `html`, or `json`.

      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.

      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or an `xpath` XPath expression, and either `'text': true` or `attr` pointing to an attribute name to grab. An example:

          'extract': {
            'url': { 'css': "#comic img", 'attr': "src" },
            'title': { 'css': "#comic img", 'attr': "title" },
            'body_text': { 'css': "div.main", 'text': true }
          }

      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example:

          'extract': {
            'title': { 'path': "results.data[*].title" },
            'description': { 'path': "results.data[*].description" }
          }

      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.

      The Agent can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
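
      For example, with placeholder credentials:

          'basic_auth': "my_username:my_password"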

      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.

      Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.

      Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.

      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload.
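
      For example, an incoming event with this payload would trigger a scrape of that page:

          { 'url': "http://example.com/page.html" }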
    MD

    event_description do
      "Events will have the fields you specified. Your options look like:\n\n    #{Utils.pretty_print options['extract']}"
    end

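    # The Agent is considered "working" if it has created an event within the
    # expected update period and has no recent error logs.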
    def working?
      event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
    end

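    # Default configuration: scrape the XKCD front page and emit an event
    # whenever the comic changes.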
    def default_options
      {
        'expected_update_period_in_days' => "2",
        'url' => "http://xkcd.com",
        'type' => "html",
        'mode' => "on_change",
        'extract' => {
          'url' => { 'css' => "#comic img", 'attr' => "src" },
          'title' => { 'css' => "#comic img", 'attr' => "alt" },
          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
        }
      }
    end

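    # Validate the required options (`url`, `expected_update_period_in_days`,
    # and `extract` for non-JSON types) as well as the optional ones.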
    def validate_options
      # Check for required fields
      errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
      if !options['extract'].present? && extraction_type != "json"
        errors.add(:base, "extract is required for all types except json")
      end

      # Check for optional fields
      if options['mode'].present?
        errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
      end

      if options['expected_update_period_in_days'].present?
        errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
      end

      if options['uniqueness_look_back'].present?
        errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
      end

      if (encoding = options['force_encoding']).present?
        case encoding
        when String
          begin
            Encoding.find(encoding)
          rescue ArgumentError
            errors.add(:base, "Unknown encoding: #{encoding.inspect}")
          end
        else
          errors.add(:base, "force_encoding must be a string")
        end
      end
    end

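    # Scheduled entry point: fetch the configured URL (or URLs) and emit events.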
    def check
      log "Fetching #{options['url']}"
      check_url options['url']
    end

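    # Fetch one or more URLs with Typhoeus, following redirects and applying
    # HTTP basic auth when configured, then parse each response and turn the
    # extracted values into events.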
    def check_url(in_url)
      hydra = Typhoeus::Hydra.new
      request_opts = { :followlocation => true }
      request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?

      requests = []
      if in_url.kind_of?(Array)
        in_url.each do |url|
          requests.push(Typhoeus::Request.new(url, request_opts))
        end
      else
        requests.push(Typhoeus::Request.new(in_url, request_opts))
      end

      requests.each do |request|
        request.on_failure do |response|
          error "Failed: #{response.inspect}"
        end

        request.on_success do |response|
          body = response.body
          if (encoding = options['force_encoding']).present?
            body = body.encode(Encoding::UTF_8, encoding)
          end
          doc = parse(body)

          if extract_full_json?
            if store_payload!(previous_payloads(1), doc)
              log "Storing new result for '#{name}': #{doc.inspect}"
              create_event :payload => doc
            end
          else
            output = {}
            options['extract'].each do |name, extraction_details|
              if extraction_type == "json"
                result = Utils.values_at(doc, extraction_details['path'])
                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
              else
                case
                when css = extraction_details['css']
                  nodes = doc.css(css)
                when xpath = extraction_details['xpath']
                  nodes = doc.xpath(xpath)
                else
                  error "'css' or 'xpath' is required for HTML or XML extraction"
                  return
                end
                unless Nokogiri::XML::NodeSet === nodes
                  error "The result of HTML/XML extraction was not a NodeSet"
                  return
                end
                result = nodes.map { |node|
                  if extraction_details['attr']
                    node.attr(extraction_details['attr'])
                  elsif extraction_details['text']
                    node.text()
                  else
                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
                    return
                  end
                }
                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              end
              output[name] = result
            end

            unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq

            if unique_lengths.length != 1
              error "Got an uneven number of matches for #{name}: #{options['extract'].inspect}"
              return
            end

            old_events = previous_payloads unique_lengths.first
            unique_lengths.first.times do |index|
              result = {}
              options['extract'].keys.each do |name|
                result[name] = output[name][index]
                if name.to_s == 'url'
                  result[name] = URI.join(request.base_url, result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
                end
              end

              if store_payload!(old_events, result)
                log "Storing new parsed result for '#{name}': #{result.inspect}"
                create_event :payload => result
              end
            end
          end
        end

        hydra.queue request
      end

      # Run all queued requests at once; the callbacks above fire as responses arrive.
      hydra.run
    end

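    # Event-driven entry point: scrape the `url` found in each incoming
    # event's payload, ignoring anything that is not an http(s) URL.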
    def receive(incoming_events)
      incoming_events.each do |event|
        url_to_scrape = event.payload['url']
        check_url(url_to_scrape) if url_to_scrape =~ /^https?:\/\//i
      end
    end

    private

    # This method returns true if the result should be stored as a new event.
    # If mode is set to 'on_change', this method may return false and update an existing
    # event to expire further in the future.
    def store_payload!(old_events, result)
      if !options['mode'].present?
        return true
      elsif options['mode'].to_s == "all"
        return true
      elsif options['mode'].to_s == "on_change"
        result_json = result.to_json
        old_events.each do |old_event|
          if old_event.payload.to_json == result_json
            old_event.expires_at = new_event_expiration_date
            old_event.save!
            return false
          end
        end
        return true
      end
      raise "Illegal options[mode]: #{options['mode']}"
    end

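    # Fetch the most recent events to compare against for uniqueness. Only
    # relevant in `on_change` mode; returns nil otherwise.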
    def previous_payloads(num_events)
      if options['uniqueness_look_back'].present?
        look_back = options['uniqueness_look_back'].to_i
      else
        # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
        look_back = UNIQUENESS_FACTOR * num_events
        if look_back < UNIQUENESS_LOOK_BACK
          look_back = UNIQUENESS_LOOK_BACK
        end
      end
      events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
    end

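    # With a `json` type and no `extract` option, the entire parsed JSON
    # document becomes the event payload.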
    def extract_full_json?
      !options['extract'].present? && extraction_type == "json"
    end

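    # Use the explicit `type` option when given; otherwise guess from the
    # URL's file extension, defaulting to "html".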
    def extraction_type
      (options['type'] || begin
        if options['url'] =~ /\.(rss|xml)$/i
          "xml"
        elsif options['url'] =~ /\.json$/i
          "json"
        else
          "html"
        end
      end).to_s
    end

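    # Parse the response body with the parser matching the extraction type.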
    def parse(data)
      case extraction_type
      when "xml"
        Nokogiri::XML(data)
      when "json"
        JSON.parse(data)
      when "html"
        Nokogiri::HTML(data)
      else
        raise "Unknown extraction type #{extraction_type}"
      end
    end

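    # True when the value parses as a positive integer; used to validate
    # numeric options that arrive as strings.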
    def is_positive_integer?(value)
      Integer(value) > 0
    rescue ArgumentError, TypeError
      false
    end
  end
end