# pdf_info_agent.rb (1.7 KB)
  require 'open-uri'
  require 'uri'
  require 'hypdf'
  module Agents
    # Agent that extracts PDF metadata through HyPDF's `pdfinfo` command.
    #
    # For each incoming event whose payload has an http(s) `url`, the document
    # is downloaded, passed to `HyPDF.pdfinfo`, and the resulting metadata is
    # emitted as a new event merged with the incoming payload.
    class PdfInfoAgent < Agent
      # Matches strings that *start* with http:// or https:// (case-insensitive).
      # `\A` is used instead of `^` on purpose: `^` also matches after any
      # newline, so a multi-line value such as "|cmd\nhttp://x" would pass a
      # `^`-anchored check.
      HTTP_URL = %r{\Ahttps?://}i

      gem_dependency_check { defined?(HyPDF) }

      cannot_be_scheduled!

      description <<~MD
        #{'## Include the `hypdf` gem in your `Gemfile` to use PDFInfo Agents.' if dependencies_missing?}
        In order for this agent to work, you need to have [HyPDF](https://devcenter.heroku.com/articles/hypdf) running and configured.
        It works by acting on events that contain a key `url` in their payload, and runs the [pdfinfo](https://devcenter.heroku.com/articles/hypdf#pdfinfo) command on them.
      MD

      event_description <<~MD
        This will change based on the metadata in the pdf.
        { "Title"=>"Everyday Rails Testing with RSpec",
        "Author"=>"Aaron Sumner",
        "Creator"=>"LaTeX with hyperref package",
        "Producer"=>"xdvipdfmx (0.7.8)",
        "CreationDate"=>"Fri Aug 2 05",
        "32"=>"50 2013",
        "Tagged"=>"no",
        "Pages"=>"150",
        "Encrypted"=>"no",
        "Page size"=>"612 x 792 pts (letter)",
        "Optimized"=>"no",
        "PDF version"=>"1.5",
        "url": "your url"
        }
      MD

      # @return [Boolean] true when `recent_error_logs?` reports no recent errors
      def working?
        !recent_error_logs?
      end

      # @return [Hash] this agent takes no options, so the defaults are empty
      def default_options
        {}
      end

      # Handles a batch of incoming events by running `check_url` on the `url`
      # entry of each event's payload.
      #
      # URL validation happens per URL inside `check_url`, so a payload whose
      # `url` is an array of URLs is processed element by element.
      #
      # @param incoming_events [Enumerable] events exposing `#payload`
      def receive(incoming_events)
        incoming_events.each do |event|
          interpolate_with(event) do
            check_url(event.payload['url'], event.payload)
          end
        end
      end

      # Downloads each given URL, runs `HyPDF.pdfinfo` on it and emits one
      # event per document.
      #
      # Entries that are not strings starting with http:// or https:// are
      # skipped silently. Malformed URLs and network failures still raise from
      # `URI.parse` / `#open`.
      #
      # @param in_url [String, Array<String>, nil] a single URL or a list of URLs
      # @param payload [Hash] incoming payload; merged over the pdfinfo result,
      #   so payload keys win when both define the same key
      def check_url(in_url, payload)
        return unless in_url.present?

        Array(in_url).each do |url|
          next unless http_url?(url)

          log "Fetching #{url}"
          # Open via the parsed URI rather than Kernel#open: Kernel#open would
          # run a subprocess for strings starting with "|", and on Ruby 3.0+
          # no longer fetches URLs at all. The block form closes the
          # downloaded IO once pdfinfo has returned.
          info = URI.parse(url).open { |pdf| HyPDF.pdfinfo(pdf) }
          create_event payload: info.merge(payload)
        end
      end

      private

      # @param url [Object] candidate taken from an event payload
      # @return [Boolean] true only for strings beginning with http(s)://
      def http_url?(url)
        url.is_a?(String) && url.match?(HTTP_URL)
      end
    end
  end