phantom_js_cloud_agent.rb 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. require 'json'
  2. require 'uri'
  3. module Agents
  4. class PhantomJsCloudAgent < Agent
  # Mixins. ERB::Util provides the url_encode helper used when building the
  # request URL; FormConfigurable and WebRequestConcern are project modules
  # (presumably the source of form_configurable and default_user_agent --
  # confirm against their definitions).
  include ERB::Util
  include FormConfigurable
  include WebRequestConcern

  # Agent-level declarations: dry runs are allowed, and the default schedule
  # is every 12 hours.
  can_dry_run!
  default_schedule('every_12h')
  # Markdown documentation for this agent, handed to the `description` class
  # macro. Blank lines separate the paragraphs and precede the bullet list so
  # that Markdown renders the list as a list, and the user agent option is
  # labelled `User agent` to match the `user_agent` option it documents.
  description <<~MD
    This Agent generates [PhantomJs Cloud](https://phantomjscloud.com/) URLs that can be used to render JavaScript-heavy webpages for content extraction.

    URLs generated by this Agent are formulated in accordance with the [PhantomJs Cloud API](https://phantomjscloud.com/docs/index.html).
    The generated URLs can then be supplied to a Website Agent to fetch and parse the content.

    [Sign up](https://dashboard.phantomjscloud.com/dash.html#/signup) to get an api key, and add it in Huginn credentials.

    Please see the [Huginn Wiki for more info](https://github.com/huginn/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud).

    Options:

    * `Api key` - PhantomJs Cloud API Key credential stored in Huginn
    * `Url` - The url to render
    * `Mode` - Create a new `clean` event or `merge` old payload with new values (default: `clean`)
    * `Render type` - Render as html, plain text without html tags, or jpg as screenshot of the page (default: `html`)
    * `Output as json` - Return the page contents and metadata as a JSON object (default: `false`)
    * `Ignore images` - Skip loading of inlined images (default: `false`)
    * `User agent` - A custom User-Agent name (default: `#{default_user_agent}`)
    * `Wait interval` - Milliseconds to delay rendering after the last resource is finished loading.
    This is useful in case there are any AJAX requests or animations that need to finish up.
    This can safely be set to 0 if you know there are no AJAX or animations you need to wait for (default: `1000`ms)

    As this agent only provides a limited subset of the most commonly used options, you can follow [this guide](https://github.com/huginn/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud) to make full use of additional options PhantomJsCloud provides.
  MD
  # Markdown sample of the events this agent emits. The JSON is set off by a
  # blank line and indented four spaces so Markdown renders it as a code block
  # rather than folding it into the preceding paragraph.
  event_description <<~MD
    Events look like this:

        {
          "url": "..."
        }
  MD
  # Builds the default option set for this agent.
  #
  # Keys are inserted in a fixed order: mode, url, render_type,
  # output_as_json, ignore_images, user_agent, wait_interval. Note that
  # 'wait_interval' defaults to the string '1000', not an integer, and that
  # no 'api_key' default is provided.
  #
  # @return [Hash{String => Object}] a fresh hash on every call
  def default_options
    defaults = {}
    defaults['mode'] = 'clean'
    defaults['url'] = 'http://xkcd.com'
    defaults['render_type'] = 'html'
    defaults['output_as_json'] = false
    defaults['ignore_images'] = false
    defaults['user_agent'] = self.class.default_user_agent
    defaults['wait_interval'] = '1000'
    defaults
  end
  # Option declarations via the form_configurable macro, in their original
  # order. :mode and :render_type are restricted to a fixed list of values;
  # :api_key carries the :completable role (see #complete_api_key).
  form_configurable :mode, type: :array, values: %w[clean merge]
  form_configurable :api_key, roles: :completable
  form_configurable :url
  form_configurable :render_type, type: :array, values: %w[html plainText jpg]
  form_configurable :output_as_json, type: :boolean
  form_configurable :ignore_images, type: :boolean
  form_configurable :user_agent, type: :text
  form_configurable :wait_interval
  # Event creation mode.
  #
  # @return [Object] the interpolated 'mode' option when it is present,
  #   otherwise the 'mode' entry of #default_options
  def mode
    configured = interpolated['mode']
    configured.present? ? configured : default_options['mode']
  end
  # Render type sent to the rendering service.
  #
  # @return [Object] the interpolated 'render_type' option when it is
  #   present, otherwise the 'render_type' entry of #default_options
  def render_type
    chosen = interpolated['render_type'].presence
    chosen.nil? ? default_options['render_type'] : chosen
  end
  # Whether the page should be returned wrapped in a JSON object.
  #
  # Takes the interpolated 'output_as_json' option, falls back to the
  # default when it is blank, and passes the result through boolify
  # (a helper defined outside this file; presumably coerces to a boolean).
  #
  # @return [Object] the result of boolify
  def output_as_json
    raw = interpolated['output_as_json'].presence
    raw = default_options['output_as_json'] if raw.nil?
    boolify(raw)
  end
  # Whether image loading should be skipped.
  #
  # Takes the interpolated 'ignore_images' option, falls back to the
  # default when it is blank, and passes the result through boolify
  # (a helper defined outside this file; presumably coerces to a boolean).
  #
  # @return [Object] the result of boolify
  def ignore_images
    setting = interpolated['ignore_images']
    boolify(setting.present? ? setting : default_options['ignore_images'])
  end
  # User-Agent string to request pages with.
  #
  # @return [Object] the interpolated 'user_agent' option when it is present,
  #   otherwise the class-level default_user_agent
  def user_agent
    custom = interpolated['user_agent']
    return custom if custom.present?

    self.class.default_user_agent
  end
  # Delay setting for rendering (documented in the agent description as
  # milliseconds).
  #
  # @return [Object] the interpolated 'wait_interval' option when it is
  #   present, otherwise the 'wait_interval' entry of #default_options
  def wait_interval
    delay = interpolated['wait_interval']
    delay.blank? ? default_options['wait_interval'] : delay
  end
  # Builds the requestSettings portion of the page request.
  #
  # Only non-default settings are included, so the returned hash may be empty:
  # * :ignoreImages - set when #ignore_images is truthy
  # * :userAgent    - set when #user_agent is present
  # * :waitInterval - set when #wait_interval differs from the default
  #
  # The wait interval is emitted under the camelCase key `waitInterval`, in
  # line with the sibling keys and the PhantomJs Cloud request settings; the
  # previous snake_case `wait_interval` key is not a setting the API knows,
  # so a custom interval had no effect. The API documents the value as a
  # number, so an all-digit value is sent as an Integer; anything else is
  # passed through unchanged.
  #
  # @return [Hash{Symbol => Object}]
  def page_request_settings
    settings = {}

    skip_images = ignore_images
    settings[:ignoreImages] = skip_images if skip_images

    agent_name = user_agent
    settings[:userAgent] = agent_name if agent_name.present?

    interval = wait_interval
    # Compare as strings so an Integer 1000 and the String '1000' both count
    # as "the default".
    if interval.to_s != default_options['wait_interval'].to_s
      digits_only = /\A\d+\z/.match?(interval.to_s)
      settings[:waitInterval] = digits_only ? interval.to_i : interval
    end

    settings
  end
  # Assembles the PhantomJs Cloud URL for the configured page.
  #
  # The page request (url, renderType, plus outputAsJson and requestSettings
  # when they apply) is serialized to JSON, logged, URL-encoded, and appended
  # as the `request` query parameter. The API key is placed in the URL path
  # as-is.
  #
  # @param interpolated [Hash] interpolated options; read with the :url and
  #   :api_key symbol keys (assumes an indifferent-access hash -- confirm)
  # @return [String] the generated URL
  def build_phantom_url(interpolated)
    page_request = { url: interpolated[:url], renderType: render_type }

    as_json = output_as_json
    page_request[:outputAsJson] = as_json if as_json

    request_settings = page_request_settings
    page_request[:requestSettings] = request_settings unless request_settings.empty?

    request_json = page_request.to_json
    log "Generated request: #{request_json}"

    key = interpolated[:api_key]
    query = url_encode(request_json)
    "https://phantomjscloud.com/api/browser/v2/#{key}/?request=#{query}"
  end
  # Creates a single event whose payload holds the generated URL under 'url',
  # built from the agent's own interpolated options (no incoming event).
  def check
    create_event(payload: { 'url' => build_phantom_url(interpolated) })
  end
  # Emits one event per incoming event, each carrying the generated
  # PhantomJs Cloud URL under 'url'.
  #
  # Options are interpolated against each incoming event. In 'merge' mode the
  # new payload is the incoming payload with 'url' set on top of it (any
  # existing 'url' key is overwritten; the incoming payload itself is not
  # mutated). In any other mode the new payload contains only 'url'.
  #
  # @param incoming_events [Enumerable] objects responding to #payload
  # @return [Object] whatever incoming_events.each returns
  def receive(incoming_events)
    incoming_events.each do |event|
      interpolate_with(event) do
        # Resolve the mode through the shared #mode helper so the blank-value
        # fallback lives in one place instead of being re-derived here.
        base_payload = mode.to_s == 'merge' ? event.payload : {}
        generated = { 'url' => build_phantom_url(interpolated) }
        create_event payload: base_payload.merge(generated)
      end
    end
  end
  # Completion candidates for the api_key option: one entry per credential of
  # the agent's user.
  #
  # @return [Array<Hash>] hashes with :text (the credential name) and :id
  #   (a `{% credential NAME %}` tag referencing that credential)
  def complete_api_key
    user.user_credentials.map do |credential|
      name = credential.credential_name
      { text: name, id: "{% credential #{name} %}" }
    end
  end
  # Health check: true when there are no recent error logs; otherwise the
  # result of received_event_without_error?.
  def working?
    return true unless recent_error_logs?

    received_event_without_error?
  end
  # Adds a base error for each required option ('url', 'api_key') that is
  # blank in the raw, uninterpolated options.
  def validate_options
    errors.add(:base, 'Url is required') if options['url'].blank?
    errors.add(:base, 'API key (credential) is required') if options['api_key'].blank?
  end
  124. end
  125. end