Validator.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. # coding:utf-8
  2. import sys
  3. import chardet
  4. from gevent import monkey
  5. monkey.patch_all()
  6. import json
  7. import os
  8. import gevent
  9. import requests
  10. import time
  11. import psutil
  12. from multiprocessing import Process, Queue
  13. import config
  14. from db.DataStore import sqlhelper
  15. from util.exception import Test_URL_Fail
  16. def detect_from_db(myip, proxy, proxies_set):
  17. proxy_dict = {'ip': proxy[0], 'port': proxy[1]}
  18. result = detect_proxy(myip, proxy_dict)
  19. if result:
  20. proxy_str = '%s:%s' % (proxy[0], proxy[1])
  21. proxies_set.add(proxy_str)
  22. else:
  23. if proxy[2] < 1:
  24. sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
  25. else:
  26. score = proxy[2]-1
  27. sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
  28. proxy_str = '%s:%s' % (proxy[0], proxy[1])
  29. proxies_set.add(proxy_str)
  30. def validator(queue1, queue2, myip):
  31. tasklist = []
  32. proc_pool = {} # 所有进程列表
  33. cntl_q = Queue() # 控制信息队列
  34. while True:
  35. if not cntl_q.empty():
  36. # 处理已结束的进程
  37. try:
  38. pid = cntl_q.get()
  39. proc = proc_pool.pop(pid)
  40. proc_ps = psutil.Process(pid)
  41. proc_ps.kill()
  42. proc_ps.wait()
  43. except Exception as e:
  44. pass
  45. # print(e)
  46. # print(" we are unable to kill pid:%s" % (pid))
  47. try:
  48. # proxy_dict = {'source':'crawl','data':proxy}
  49. if len(proc_pool) >= config.MAX_CHECK_PROCESS:
  50. time.sleep(config.CHECK_WATI_TIME)
  51. continue
  52. proxy = queue1.get()
  53. tasklist.append(proxy)
  54. if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS:
  55. p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
  56. p.start()
  57. proc_pool[p.pid] = p
  58. tasklist = []
  59. except Exception as e:
  60. if len(tasklist) > 0:
  61. p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
  62. p.start()
  63. proc_pool[p.pid] = p
  64. tasklist = []
  65. def process_start(tasks, myip, queue2, cntl):
  66. spawns = []
  67. for task in tasks:
  68. spawns.append(gevent.spawn(detect_proxy, myip, task, queue2))
  69. gevent.joinall(spawns)
  70. cntl.put(os.getpid()) # 子进程退出是加入控制队列
  71. def detect_proxy(selfip, proxy, queue2=None):
  72. '''
  73. :param proxy: ip字典
  74. :return:
  75. '''
  76. ip = proxy['ip']
  77. port = proxy['port']
  78. proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
  79. protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies)
  80. if protocol >= 0:
  81. proxy['protocol'] = protocol
  82. proxy['types'] = types
  83. proxy['speed'] = speed
  84. else:
  85. proxy = None
  86. if queue2:
  87. queue2.put(proxy)
  88. return proxy
  89. def checkProxy(selfip, proxies):
  90. '''
  91. 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型
  92. :param
  93. :return:
  94. '''
  95. protocol = -1
  96. types = -1
  97. speed = -1
  98. http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
  99. https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
  100. if http and https:
  101. protocol = 2
  102. types = http_types
  103. speed = http_speed
  104. elif http:
  105. types = http_types
  106. protocol = 0
  107. speed = http_speed
  108. elif https:
  109. types = https_types
  110. protocol = 1
  111. speed = https_speed
  112. else:
  113. types = -1
  114. protocol = -1
  115. speed = -1
  116. return protocol, types, speed
  117. def _checkHttpProxy(selfip, proxies, isHttp=True):
  118. types = -1
  119. speed = -1
  120. if isHttp:
  121. test_url = config.TEST_HTTP_HEADER
  122. else:
  123. test_url = config.TEST_HTTPS_HEADER
  124. try:
  125. start = time.time()
  126. r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
  127. if r.ok:
  128. speed = round(time.time() - start, 2)
  129. content = json.loads(r.text)
  130. headers = content['headers']
  131. ip = content['origin']
  132. proxy_connection = headers.get('Proxy-Connection', None)
  133. if ',' in ip:
  134. types = 2
  135. elif proxy_connection:
  136. types = 1
  137. else:
  138. types = 0
  139. return True, types, speed
  140. else:
  141. return False, types, speed
  142. except Exception as e:
  143. return False, types, speed
  144. def baidu_check(selfip, proxies):
  145. '''
  146. 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型
  147. :param
  148. :return:
  149. '''
  150. protocol = -1
  151. types = -1
  152. speed = -1
  153. # try:
  154. # #http://ip.chinaz.com/getip.aspx挺稳定,可以用来检测ip
  155. # r = requests.get(url=config.TEST_URL, headers=config.get_header(), timeout=config.TIMEOUT,
  156. # proxies=proxies)
  157. # r.encoding = chardet.detect(r.content)['encoding']
  158. # if r.ok:
  159. # if r.text.find(selfip)>0:
  160. # return protocol, types, speed
  161. # else:
  162. # return protocol,types,speed
  163. #
  164. #
  165. # except Exception as e:
  166. # return protocol, types, speed
  167. try:
  168. start = time.time()
  169. r = requests.get(url='https://www.baidu.com', headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
  170. r.encoding = chardet.detect(r.content)['encoding']
  171. if r.ok:
  172. speed = round(time.time() - start, 2)
  173. protocol= 0
  174. types=0
  175. else:
  176. speed = -1
  177. protocol= -1
  178. types=-1
  179. except Exception as e:
  180. speed = -1
  181. protocol = -1
  182. types = -1
  183. return protocol, types, speed
  184. def getMyIP():
  185. try:
  186. r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT)
  187. ip = json.loads(r.text)
  188. return ip['origin']
  189. except Exception as e:
  190. raise Test_URL_Fail
  191. if __name__ == '__main__':
  192. ip = '222.186.161.132'
  193. port = 3128
  194. proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
  195. _checkHttpProxy(None,proxies)
  196. # getMyIP()
  197. # str="{ip:'61.150.43.121',address:'陕西省西安市 西安电子科技大学'}"
  198. # j = json.dumps(str)
  199. # str = j['ip']
  200. # print str