crawl.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. """Compare the speed of downloading URLs sequentially vs. using futures."""
  2. import functools
  3. import time
  4. import timeit
  5. import sys
  6. try:
  7. from urllib2 import urlopen
  8. except ImportError:
  9. from urllib.request import urlopen
  10. from concurrent.futures import (as_completed, ThreadPoolExecutor,
  11. ProcessPoolExecutor)
  12. URLS = ['http://www.google.com/',
  13. 'http://www.apple.com/',
  14. 'http://www.ibm.com',
  15. 'http://www.thisurlprobablydoesnotexist.com',
  16. 'http://www.slashdot.org/',
  17. 'http://www.python.org/',
  18. 'http://www.bing.com/',
  19. 'http://www.facebook.com/',
  20. 'http://www.yahoo.com/',
  21. 'http://www.youtube.com/',
  22. 'http://www.blogger.com/']
  23. def load_url(url, timeout):
  24. kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {}
  25. return urlopen(url, **kwargs).read()
  26. def download_urls_sequential(urls, timeout=60):
  27. url_to_content = {}
  28. for url in urls:
  29. try:
  30. url_to_content[url] = load_url(url, timeout=timeout)
  31. except:
  32. pass
  33. return url_to_content
  34. def download_urls_with_executor(urls, executor, timeout=60):
  35. try:
  36. url_to_content = {}
  37. future_to_url = dict((executor.submit(load_url, url, timeout), url)
  38. for url in urls)
  39. for future in as_completed(future_to_url):
  40. try:
  41. url_to_content[future_to_url[future]] = future.result()
  42. except:
  43. pass
  44. return url_to_content
  45. finally:
  46. executor.shutdown()
  47. def main():
  48. for name, fn in [('sequential',
  49. functools.partial(download_urls_sequential, URLS)),
  50. ('processes',
  51. functools.partial(download_urls_with_executor,
  52. URLS,
  53. ProcessPoolExecutor(10))),
  54. ('threads',
  55. functools.partial(download_urls_with_executor,
  56. URLS,
  57. ThreadPoolExecutor(10)))]:
  58. sys.stdout.write('%s: ' % name.ljust(12))
  59. start = time.time()
  60. url_map = fn()
  61. sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' %
  62. (time.time() - start, len(url_map), len(URLS)))
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()