#coding:utf-8
import gevent
from gevent.pool import Pool
from multiprocessing import Queue, Process, Value
import time
import sys
from api.apiServer import start_api_server
from config import THREADNUM, parserList, UPDATE_TIME, MINNUM
from db.DataStore import store_data, sqlhelper
from spider.HtmlDownloader import Html_Downloader
from spider.HtmlPraser import Html_Parser
from validator.Validator import validator, getMyIP, detect_from_db

__author__ = 'qiye'

# NOTE(review): the patch is applied after the imports above, exactly as in
# the original statement order; gevent's documentation recommends patching as
# early as possible, so confirm this ordering is intentional.
from gevent import monkey
monkey.patch_all()


def startProxyCrawl(queue, db_proxy_num):
    """Create a ProxyCrawl for the given arguments and run it.

    Args:
        queue: Passed through to ProxyCrawl unchanged.
        db_proxy_num: Passed through to ProxyCrawl unchanged; ``run`` assigns
            to its ``.value`` attribute (presumably a ``multiprocessing.Value``
            -- TODO confirm against the caller).

    ``ProxyCrawl.run`` loops on ``while True``, so within the code visible
    here this call does not return.
    """
    crawl = ProxyCrawl(queue, db_proxy_num)
    crawl.run()


class ProxyCrawl(object):
    """Describes the crawler logic.

    The class re-validates the proxies already stored in the database on
    every pass of ``run`` and records how many were collected.
    """

    # Class-level set, therefore shared by every ProxyCrawl instance.
    # It is cleared at the start of each pass of run().
    proxies = set()

    def __init__(self, queue, db_proxy_num):
        """Store the collaborators and create the greenlet pool.

        Args:
            queue: Stored as ``self.queue``; not used in the visible code.
            db_proxy_num: Object whose ``.value`` is set to the number of
                collected proxies after each database check.
        """
        self.crawl_pool = Pool(THREADNUM)
        self.queue = queue
        self.db_proxy_num = db_proxy_num

    def run(self):
        """Loop forever, re-checking the proxies stored in the database.

        Each pass clears ``self.proxies``, prints a banner, loads the stored
        proxies with ``sqlhelper.select()``, and runs ``detect_from_db`` for
        every one of them in its own greenlet. When all greenlets finish,
        ``db_proxy_num.value`` is set to ``len(self.proxies)``.
        """
        while True:
            self.proxies.clear()
            # NOTE(review): `str` shadows the builtin. The name is kept
            # because the remainder of this method is not visible here and
            # may still refer to it.
            str = 'IPProxyPool----->>>>>>>>beginning'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
            proxylist = sqlhelper.select()
            myip = getMyIP()
            # One greenlet per stored proxy. detect_from_db receives the
            # shared set; presumably it adds the proxies that still work --
            # TODO confirm in validator.Validator.
            spawns = [
                gevent.spawn(detect_from_db, myip, proxy, self.proxies)
                for proxy in proxylist
            ]
            gevent.joinall(spawns)
            self.db_proxy_num.value = len(self.proxies)
            str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
            # NOTE(review): the visible source is cut off at this point, in
            # the middle of the statement `if len(self.proxies)`. The rest of
            # this loop body is not shown and must be restored from the
            # complete file; nothing beyond this line has been reconstructed.