ProxyCrawl.py

# coding:utf-8
# Patch the standard library for cooperative (gevent) I/O before the other
# modules are imported.
from gevent import monkey
monkey.patch_all()

import sys
import time
from multiprocessing import Queue, Process, Value

import gevent
from gevent.pool import Pool

from api.apiServer import start_api_server
from config import THREADNUM, parserList, UPDATE_TIME, MINNUM
from db.DataStore import store_data, sqlhelper
from spider.HtmlDownloader import Html_Downloader
from spider.HtmlPraser import Html_Parser
from validator.Validator import validator, getMyIP, detect_from_db

__author__ = 'qiye'
'''
This module implements the crawler's logic.
'''
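# Pipeline overview (wired up in the __main__ block below):
#   crawler process   -- puts freshly scraped proxies on q1 -->
#   validator process -- puts verified proxies on q2 -->
#   storage process   -- writes them to the database,
# while a separate process serves the HTTP API from that database.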
def startProxyCrawl(queue, db_proxy_num):
    # Entry point for the crawler process.
    crawl = ProxyCrawl(queue, db_proxy_num)
    crawl.run()
class ProxyCrawl(object):
    # "ip:port" strings collected in the current round
    # (shared across instances as a class attribute).
    proxies = set()

    def __init__(self, queue, db_proxy_num):
        self.crawl_pool = Pool(THREADNUM)
        self.queue = queue
        self.db_proxy_num = db_proxy_num

    def run(self):
        while True:
            self.proxies.clear()
            msg = 'IPProxyPool----->>>>>>>>beginning'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()

            # Re-validate every proxy already stored in the database.
            proxylist = sqlhelper.select()
            myip = getMyIP()
            spawns = [gevent.spawn(detect_from_db, myip, proxy, self.proxies)
                      for proxy in proxylist]
            gevent.joinall(spawns)

            self.db_proxy_num.value = len(self.proxies)
            msg = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

            if len(self.proxies) < MINNUM:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM, start crawling...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                # Crawl every configured source site in parallel.
                self.crawl_pool.map(self.crawl, parserList)
            else:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num meets the requirement, wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()

            time.sleep(UPDATE_TIME)
    def crawl(self, parser):
        html_parser = Html_Parser()
        for url in parser['urls']:
            response = Html_Downloader.download(url)
            if response is not None:
                proxylist = html_parser.parse(response, parser)
                if proxylist is not None:
                    for proxy in proxylist:
                        proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                        if proxy_str not in self.proxies:
                            # Hand each newly seen proxy to the validator process.
                            self.proxies.add(proxy_str)
                            self.queue.put(proxy)
if __name__ == "__main__":
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue()  # crawler -> validator
    q2 = Queue()  # validator -> storage
    p0 = Process(target=start_api_server)
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM))
    p2 = Process(target=validator, args=(q1, q2))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
    p0.start()
    p1.start()
    p2.start()
    p3.start()
    # spider = ProxyCrawl()
    # spider.run()