# david/huginn — mirror of https://github.com/huginn/huginn.git
							require 'open-uri'
require 'hypdf'

module Agents
  # Agent that reacts to incoming events carrying a `url` key: it downloads
  # the document at that URL, hands it to `HyPDF.pdfinfo` and emits one event
  # with the returned metadata merged with the incoming payload.
  class PdfInfoAgent < Agent
    # Accepts only strings that *start* with http:// or https://.
    # `\A` is used instead of `^` because `^` matches at the start of any
    # line, which would let a multi-line value such as "| cmd\nhttp://x"
    # pass the check.
    HTTP_URL_PATTERN = %r{\Ahttps?://}i

    gem_dependency_check { defined?(HyPDF) }

    cannot_be_scheduled!
    no_bulk_receive!

    description <<~MD
      The PDF Info Agent returns the metadata contained within a given PDF file, using HyPDF.

      #{'## Include the `hypdf` gem in your `Gemfile` to use PDFInfo Agents.' if dependencies_missing?}

      In order for this agent to work, you need to have [HyPDF](https://devcenter.heroku.com/articles/hypdf) running and configured.

      It works by acting on events that contain a key `url` in their payload, and runs the [pdfinfo](https://devcenter.heroku.com/articles/hypdf#pdfinfo) command on them.
    MD

    event_description do
      "This will change based on the metadata in the pdf.\n\n    " +
        Utils.pretty_print({
          "Title" => "Everyday Rails Testing with RSpec",
          "Author" => "Aaron Sumner",
          "Creator" => "LaTeX with hyperref package",
          "Producer" => "xdvipdfmx (0.7.8)",
          "CreationDate" => "Fri Aug  2 05",
          "32" => "50 2013",
          "Tagged" => "no",
          "Pages" => "150",
          "Encrypted" => "no",
          "Page size" => "612 x 792 pts (letter)",
          "Optimized" => "no",
          "PDF version" => "1.5",
          "url": "your url"
        })
    end

    # The agent is considered healthy as long as it has no recent error logs.
    def working?
      !recent_error_logs?
    end

    # This agent has no configurable options.
    def default_options
      {}
    end

    # Processes each incoming event whose payload `url` is an http(s) URL
    # string; all other events are ignored.
    #
    # @param incoming_events [Enumerable] events, each responding to `payload`
    def receive(incoming_events)
      incoming_events.each do |event|
        interpolate_with(event) do
          url_to_scrape = event.payload['url']
          check_url(url_to_scrape, event.payload) if http_url?(url_to_scrape)
        end
      end
    end

    # Fetches each given URL, runs `HyPDF.pdfinfo` on the download and creates
    # an event from the result. Entries that are not http(s) URL strings are
    # skipped, so nothing but an HTTP fetch can be triggered from here.
    #
    # @param in_url [String, Array<String>, nil] one URL or a list of URLs
    # @param payload [Hash] payload of the triggering event; merged over the
    #   pdfinfo result, so on a key clash the incoming payload's value wins
    def check_url(in_url, payload)
      return unless in_url.present?

      Array(in_url).each do |url|
        next unless http_url?(url)

        log "Fetching #{url}"
        # URI#open (from open-uri) replaces the bare `open(url)` call: opening
        # URLs through Kernel#open was removed in Ruby 3.0, and Kernel#open
        # runs strings starting with "|" as commands. The block form closes
        # the downloaded IO as soon as pdfinfo has returned.
        info = URI.parse(url).open { |pdf| HyPDF.pdfinfo(pdf) }
        create_event payload: info.merge(payload)
      end
    end

    private

    # True only for String values beginning with http:// or https://.
    # The explicit String check keeps nil, arrays and other payload values
    # from reaching a regexp match they do not support.
    def http_url?(url)
      url.is_a?(String) && url.match?(HTTP_URL_PATTERN)
    end
  end
end