Validator.py 4.1 KB

# coding:utf-8
# Validator: checks whether crawled proxies actually work, how fast they are
# and (optionally) how anonymous they are.
import re
import time
from multiprocessing import Process

import gevent
from gevent import monkey
import requests
from lxml import etree

import config
from config import TEST_URL
from db.DataStore import sqlhelper
from util.exception import Test_URL_Fail

monkey.patch_all()


def detect_from_db(myip, proxy, proxies_set):
    # Re-check a proxy row loaded from the database: bump its score
    # (capped at 60000) if it still works, otherwise delete the row.
    proxy_dict = {'ip': proxy[0], 'port': proxy[1]}
    result = detect_list(myip, proxy_dict)
    if result:
        if proxy[2] < 60000:
            score = proxy[2] + 1
        else:
            score = 60000
        proxy_str = '%s:%s' % (proxy[0], proxy[1])
        proxies_set.add(proxy_str)
        sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
    else:
        sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})


def validator(queue1, queue2):
    # Pull crawled proxies off queue1 and hand them to worker processes in
    # batches of 500; a queue timeout flushes whatever has accumulated so far.
    tasklist = []
    myip = getMyIP()
    while True:
        try:
            proxy = queue1.get(timeout=10)
            tasklist.append(proxy)
            if len(tasklist) > 500:
                p = Process(target=process_start, args=(tasklist, myip, queue2))
                p.start()
                tasklist = []
        except Exception:
            if len(tasklist) > 0:
                p = Process(target=process_start, args=(tasklist, myip, queue2))
                p.start()
                tasklist = []


def process_start(tasks, myip, queue2):
    # Validate one batch of proxies concurrently with gevent greenlets.
    spawns = []
    for task in tasks:
        spawns.append(gevent.spawn(detect_list, myip, task, queue2))
    gevent.joinall(spawns)


def detect_list(selfip, proxy, queue2=None):
    '''
    Check a single proxy by requesting TEST_URL through it.
    :param proxy: dict with 'ip' and 'port' keys
    :return: the proxy dict with 'speed' added on success, otherwise None
    '''
    ip = proxy['ip']
    port = proxy['port']
    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
    # checkProxyType(selfip, proxies) could be used here to classify the
    # anonymity level, but that check is skipped and the type defaults to 0.
    proxy['type'] = 0
    start = time.time()
    try:
        r = requests.get(url=TEST_URL, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
        if not r.ok or r.text.find(ip) == -1:
            proxy = None
        else:
            speed = round(time.time() - start, 2)
            proxy['speed'] = speed
    except Exception:
        proxy = None
    if queue2:
        queue2.put(proxy)
    return proxy


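# Example (a sketch, not part of the original module): validating one proxy
# directly. The address below is a placeholder, and this assumes TEST_URL is a
# page that echoes the caller's IP, which is what the r.text.find(ip) check
# above relies on.
#
#   my_ip = getMyIP()
#   result = detect_list(my_ip, {'ip': '127.0.0.1', 'port': '8080'})
#   print(result)   # dict with 'speed' and 'type' on success, None on failure

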
def checkProxyType(selfip, proxies):
    '''
    Detect the anonymity level of a proxy. The level advertised by free proxy
    sites is often unreliable, so it is measured directly against TEST_PROXY.
    :param proxies: requests-style proxies dict
    :return: 0 elite (high anonymity), 1 anonymous, 2 transparent, 3 invalid
    '''
    try:
        r = requests.get(url=config.TEST_PROXY, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
        if r.ok:
            root = etree.HTML(r.text)
            ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
            http_x_forwarded_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
            http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
            if ip == selfip:
                return 3
            if http_x_forwarded_for is None and http_via is None:
                return 0
            if http_via is not None and http_x_forwarded_for.find(selfip) == -1:
                return 1
            if http_via is not None and http_x_forwarded_for.find(selfip) != -1:
                return 2
        return 3
    except Exception:
        return 3


def getMyIP():
    # Fetch TEST_URL without a proxy and extract this machine's public IP.
    try:
        r = requests.get(url=config.TEST_URL, headers=config.HEADER, timeout=config.TIMEOUT)
        match = re.search(r'\d+\.\d+\.\d+\.\d+', r.text)
        if match:
            return match.group()
        raise Test_URL_Fail
    except Exception:
        raise Test_URL_Fail


if __name__ == '__main__':
    getMyIP()
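
This file only defines the validation side; how queue1 and queue2 get filled is handled elsewhere in the project (assumed here to be the crawler/scheduler startup code). A minimal driver sketch, assuming multiprocessing queues, that the module lives at validator/Validator.py (an assumption), and that the crawler puts {'ip': ..., 'port': ...} dicts on the input queue:

    from multiprocessing import Process, Queue

    from validator.Validator import validator  # import path is an assumption

    if __name__ == '__main__':
        raw_queue = Queue()       # proxies coming in from the crawler
        checked_queue = Queue()   # validated proxies (or None) coming back
        raw_queue.put({'ip': '127.0.0.1', 'port': '8080'})   # placeholder proxy
        p = Process(target=validator, args=(raw_queue, checked_queue))
        p.start()                 # validator() loops forever; terminate() to stop

Each item popped from checked_queue is either the original dict with 'speed' and 'type' added, or None when the proxy failed the check; note that validator() needs network access to config.TEST_URL because it calls getMyIP() on startup.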