require 'json'
require 'uri'
module Agents
  # Builds PhantomJs Cloud request URLs from the agent's options.
  #
  # The agent does not fetch the rendered page itself: it only emits an event
  # whose payload carries the generated URL under the 'url' key, either on a
  # schedule (#check) or once per incoming event (#receive).
  class PhantomJsCloudAgent < Agent
    include ERB::Util
    include FormConfigurable
    include WebRequestConcern

    can_dry_run!

    default_schedule 'every_12h'

    description <<~MD
      This Agent generates [PhantomJs Cloud](https://phantomjscloud.com/) URLs that can be used to render JavaScript-heavy webpages for content extraction.

      URLs generated by this Agent are formulated in accordance with the [PhantomJs Cloud API](https://phantomjscloud.com/docs/index.html).
      The generated URLs can then be supplied to a Website Agent to fetch and parse the content.

      [Sign up](https://dashboard.phantomjscloud.com/dash.html#/signup) to get an api key, and add it in Huginn credentials.

      Please see the [Huginn Wiki for more info](https://github.com/huginn/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud).

      Options:

      * `Api key` - PhantomJs Cloud API Key credential stored in Huginn
      * `Url` - The url to render
      * `Mode` - Create a new `clean` event or `merge` old payload with new values (default: `clean`)
      * `Render type` - Render as html, plain text without html tags, or jpg as screenshot of the page (default: `html`)
      * `Output as json` - Return the page contents and metadata as a JSON object (default: `false`)
      * `Ignore images` - Skip loading of inlined images (default: `false`)
      * `User agent` - A custom User-Agent name (default: `#{default_user_agent}`)
      * `Wait interval` - Milliseconds to delay rendering after the last resource is finished loading.
      This is useful in case there are any AJAX requests or animations that need to finish up.
      This can safely be set to 0 if you know there are no AJAX or animations you need to wait for (default: `1000`ms)

      As this agent only provides a limited subset of the most commonly used options, you can follow [this guide](https://github.com/huginn/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud) to make full use of additional options PhantomJsCloud provides.
    MD

    event_description <<~MD
      Events look like this:

          {
            "url": "..."
          }
    MD

    # Option values used for a new agent and as fallbacks by the accessors below.
    #
    # @return [Hash{String => Object}]
    def default_options
      {
        'mode' => 'clean',
        'url' => 'http://xkcd.com',
        'render_type' => 'html',
        'output_as_json' => false,
        'ignore_images' => false,
        'user_agent' => self.class.default_user_agent,
        'wait_interval' => '1000'
      }
    end

    form_configurable :mode, type: :array, values: ['clean', 'merge']
    form_configurable :api_key, roles: :completable
    form_configurable :url
    form_configurable :render_type, type: :array, values: ['html', 'plainText', 'jpg']
    form_configurable :output_as_json, type: :boolean
    form_configurable :ignore_images, type: :boolean
    form_configurable :user_agent, type: :text
    form_configurable :wait_interval

    # @return [String] the configured mode, or the default ('clean') when blank
    def mode
      interpolated['mode'].presence || default_options['mode']
    end

    # @return [String] the configured render type, or the default ('html') when blank
    def render_type
      interpolated['render_type'].presence || default_options['render_type']
    end

    # @return [Object] result of `boolify` on the configured value, falling back
    #   to the default (false) when the option is blank
    def output_as_json
      boolify(interpolated['output_as_json'].presence ||
        default_options['output_as_json'])
    end

    # @return [Object] result of `boolify` on the configured value, falling back
    #   to the default (false) when the option is blank
    def ignore_images
      boolify(interpolated['ignore_images'].presence ||
        default_options['ignore_images'])
    end

    # @return [String] the configured User-Agent, or the class-level default when blank
    def user_agent
      interpolated['user_agent'].presence || self.class.default_user_agent
    end

    # @return [String] the configured wait interval, or the default ('1000') when blank
    def wait_interval
      interpolated['wait_interval'].presence || default_options['wait_interval']
    end

    # Collects the optional `requestSettings` entries of the page request.
    # Only settings that are enabled or differ from the default are included,
    # so the hash is empty when nothing needs to be sent.
    #
    # @return [Hash{Symbol => Object}]
    def page_request_settings
      settings = {}
      skip_images = ignore_images
      settings[:ignoreImages] = skip_images if skip_images
      settings[:userAgent] = user_agent if user_agent.present?
      # The PhantomJs Cloud API names this setting `waitInterval` (camelCase,
      # like the keys above); a snake_case key would not be recognised.
      settings[:waitInterval] = wait_interval if wait_interval != default_options['wait_interval']
      settings
    end

    # Serialises the page request to JSON, URL-encodes it and embeds it in the
    # PhantomJs Cloud browser API URL together with the API key.
    #
    # @param interpolated [Hash] options to read :api_key and :url from
    #   (symbol keys are used, so this presumably is an indifferent-access hash)
    # @return [String] the PhantomJs Cloud request URL
    def build_phantom_url(interpolated)
      api_key = interpolated[:api_key]
      page_request_hash = {
        url: interpolated[:url],
        renderType: render_type
      }
      page_request_hash[:outputAsJson] = output_as_json if output_as_json

      page_request_settings_hash = page_request_settings
      page_request_hash[:requestSettings] = page_request_settings_hash if page_request_settings_hash.any?

      request = page_request_hash.to_json
      # Only the request JSON is logged; the API key is not part of it.
      log "Generated request: #{request}"

      encoded = url_encode(request)
      "https://phantomjscloud.com/api/browser/v2/#{api_key}/?request=#{encoded}"
    end

    # Scheduled run: emits a single event carrying the generated URL.
    def check
      phantom_url = build_phantom_url(interpolated)
      create_event payload: { 'url' => phantom_url }
    end

    # Emits one event per incoming event. In 'merge' mode the incoming payload
    # is kept and its 'url' key is overwritten with the generated URL;
    # otherwise the new payload contains only 'url'.
    #
    # @param incoming_events [Enumerable] events whose payload may be merged
    def receive(incoming_events)
      incoming_events.each do |event|
        interpolate_with(event) do
          existing_payload = mode.to_s == 'merge' ? event.payload : {}
          phantom_url = build_phantom_url(interpolated)
          create_event payload: existing_payload.merge('url' => phantom_url)
        end
      end
    end

    # Completion entries for the `api_key` field: one per credential of the
    # agent's user, inserted as a `credential` Liquid tag.
    #
    # @return [Array<Hash>] hashes with :text and :id keys
    def complete_api_key
      user.user_credentials.map do |c|
        { text: c.credential_name, id: "{% credential #{c.credential_name} %}" }
      end
    end

    # @return [Boolean] true unless there are recent error logs without a
    #   subsequent event received without error
    def working?
      !recent_error_logs? || received_event_without_error?
    end

    # Adds a validation error for each required option that is missing.
    def validate_options
      # Check for required fields
      errors.add(:base, 'Url is required') unless options['url'].present?
      errors.add(:base, 'API key (credential) is required') unless options['api_key'].present?
    end
  end
end