david
/
huginn
mirror of https://github.com/huginn/huginn.git


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
							require 'open-uri'
require 'hypdf'

module Agents
  class PdfInfoAgent < Agent

    gem_dependency_check { defined?(HyPDF) }

    cannot_be_scheduled!

    description <<-MD
      #{'## Include the `hypdf` gem in your `Gemfile` to use PDFInfo Agents.' if dependencies_missing?}

      In order for this agent to work, you need to have [HyPDF](https://devcenter.heroku.com/articles/hypdf) running and configured.

      It works by acting on events that contain a key `url` in their payload, and runs the [pdfinfo](https://devcenter.heroku.com/articles/hypdf#pdfinfo) command on them.
    MD

    event_description <<-MD
    This will change based on the metadata in the pdf.

      { "Title"=>"Everyday Rails Testing with RSpec", 
        "Author"=>"Aaron Sumner",
        "Creator"=>"LaTeX with hyperref package",
        "Producer"=>"xdvipdfmx (0.7.8)",
        "CreationDate"=>"Fri Aug  2 05",
        "32"=>"50 2013",
        "Tagged"=>"no",
        "Pages"=>"150",
        "Encrypted"=>"no",
        "Page size"=>"612 x 792 pts (letter)",
        "Optimized"=>"no",
        "PDF version"=>"1.5",
        "url": "your url"
      }
    MD

    def working?
      !recent_error_logs?
    end

    def default_options
      {}
    end

    def receive(incoming_events)
      incoming_events.each do |event|
        interpolate_with(event) do
          url_to_scrape = event.payload['url']
          check_url(url_to_scrape, event.payload) if url_to_scrape =~ /^https?:\/\//i
        end
      end
    end

    def check_url(in_url, payload)
      return unless in_url.present?
      Array(in_url).each do |url|
        log "Fetching #{url}"
        info = HyPDF.pdfinfo(open(url))
        create_event :payload => info.merge(payload)
      end
    end

  end
end