Validator.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. # coding:utf-8
  2. import json
  3. from multiprocessing import Process
  4. import gevent
  5. import requests
  6. import time
  7. import config
  8. from db.DataStore import sqlhelper
  9. from util.exception import Test_URL_Fail
  10. from gevent import monkey
  11. monkey.patch_all()
  def detect_from_db(myip, proxy, proxies_set):
      '''
      Re-check a proxy record taken from the database and persist the outcome.

      A proxy that still passes detect_proxy has its score raised by one
      (never above 60000), is recorded in proxies_set as "ip:port", and is
      updated in the database. A proxy that fails is deleted from the database.

      :param myip: this machine's own IP, forwarded to detect_proxy
      :param proxy: indexable record read as [0]=ip, [1]=port, [2]=score
      :param proxies_set: collection with an add() method that receives the
                          "ip:port" string of every proxy that passes
      :return: None
      '''
      host, port = proxy[0], proxy[1]
      # detect_proxy mutates the dict it is given, so it gets its own copy and
      # the database conditions below are built from scratch.
      alive = detect_proxy(myip, {'ip': host, 'port': port})
      if not alive:
          sqlhelper.delete({'ip': host, 'port': port})
          return
      # Only read the score once the proxy is known to be alive.
      score = proxy[2] + 1 if proxy[2] < 60000 else 60000
      proxies_set.add('%s:%s' % (host, port))
      sqlhelper.update({'ip': host, 'port': port}, {'score': score})
  def validator(queue1, queue2, myip):
      '''
      Pull proxies from queue1 forever and validate them in batches, running
      each batch in a separate child process via process_start.

      A batch is dispatched as soon as it holds more than 500 proxies. When
      anything inside the loop raises (presumably queue1.get timing out after
      10 seconds with nothing to read), whatever is pending is dispatched
      immediately, provided there is at least one proxy waiting.

      :param queue1: source queue; must support get(timeout=...)
      :param queue2: result queue handed through to process_start
      :param myip: this machine's own IP, handed through to process_start
      :return: never returns
      '''
      def dispatch(batch):
          # One child process per batch; the process is started and not joined.
          worker = Process(target=process_start, args=(batch, myip, queue2))
          worker.start()

      pending = []
      while True:
          try:
              # Items are expected to look like {'source': 'crawl', 'data': proxy}
              # according to the original author's note -- TODO confirm.
              pending.append(queue1.get(timeout=10))
              if len(pending) > 500:
                  dispatch(pending)
                  pending = []
          except Exception:
              if pending:
                  dispatch(pending)
                  pending = []
  def process_start(tasks, myip, queue2):
      '''
      Validate a batch of proxies concurrently inside the current process.

      One gevent greenlet running detect_proxy is spawned per task, and the
      function returns only after every greenlet has finished.

      :param tasks: iterable of proxy dicts to check
      :param myip: this machine's own IP, passed to detect_proxy
      :param queue2: queue passed to detect_proxy for publishing results
      :return: None
      '''
      greenlets = [gevent.spawn(detect_proxy, myip, task, queue2) for task in tasks]
      gevent.joinall(greenlets)
  def detect_proxy(selfip, proxy, queue2=None):
      '''
      Probe a single proxy and annotate it with the measured properties.

      :param selfip: this machine's own IP, passed through to checkProxy
      :param proxy: dict with 'ip' and 'port' keys; when the proxy works it is
                    updated in place with 'protocol', 'type' and 'speed'
      :param queue2: optional queue; when supplied, the result (the annotated
                     dict, or None for a proxy that failed) is put on it
      :return: the annotated proxy dict, or None when the proxy failed
      '''
      ip = proxy['ip']
      port = proxy['port']
      # The same plain-HTTP proxy endpoint is used for both URL schemes.
      proxy_url = "http://%s:%s" % (ip, port)
      proxies = {"http": proxy_url, "https": proxy_url}
      protocol, types, speed = checkProxy(selfip, proxies)
      # checkProxy reports 0 for "HTTP only", 1 for "HTTPS only", 2 for both and
      # -1 for failure, so 0 is a success code. The previous `protocol > 0`
      # test silently discarded every working HTTP-only proxy.
      if protocol >= 0:
          proxy['protocol'] = protocol
          proxy['type'] = types
          proxy['speed'] = speed
      else:
          proxy = None
      # Compare against None explicitly: a queue object may define its own
      # truthiness (e.g. be falsy while empty), which must not suppress the put.
      if queue2 is not None:
          queue2.put(proxy)
      return proxy
  def checkProxy(selfip, proxies):
      '''
      Determine which protocols a proxy supports by testing it directly.

      The details advertised by free proxy listing sites turned out to be
      unreliable, so the proxy is probed here instead of trusting them.

      Both probes are always executed: first the HTTP test URL, then the
      HTTPS one.

      :param selfip: this machine's own IP, passed to _checkHttpProxy
      :param proxies: proxies mapping passed to _checkHttpProxy
      :return: (protocol, types, speed) where protocol is 0 for HTTP only,
               1 for HTTPS only, 2 for both and -1 when neither probe passed.
               types and speed come from the HTTP probe whenever it passed,
               otherwise from the HTTPS probe; both are -1 on total failure.
      '''
      http_ok, http_types, http_speed = _checkHttpProxy(selfip, proxies)
      https_ok, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)

      if http_ok:
          # The HTTP measurements take precedence even when HTTPS also works.
          return (2 if https_ok else 0), http_types, http_speed
      if https_ok:
          return 1, https_types, https_speed
      return -1, -1, -1
  def _checkHttpProxy(selfip, proxies, isHttp=True):
      '''
      Request the configured test URL through the proxy and classify the proxy
      from the echoed request data.

      The response body is expected to be JSON with an 'origin' entry (the
      address the test server saw) and a 'headers' entry (the request headers
      it received).

      :param selfip: this machine's own IP, used to detect whether it leaks
      :param proxies: proxies mapping handed to requests.get
      :param isHttp: True to use config.TEST_HTTP_HEADER, False to use
                     config.TEST_HTTPS_HEADER
      :return: (ok, types, speed).
               ok is False when the request fails, the response is not OK, the
               body cannot be interpreted, or 'origin' contains selfip or a
               comma. types is 0 when neither X-Forwarded-For nor X-Real-Ip was
               received, 1 when at least one was received but neither contains
               selfip, 2 when one of them contains selfip, and -1 when not
               determined. speed is the elapsed request time in seconds rounded
               to two decimals, or -1 when no OK response was obtained.
      '''
      types = -1
      speed = -1
      test_url = config.TEST_HTTP_HEADER if isHttp else config.TEST_HTTPS_HEADER
      try:
          start = time.time()
          r = requests.get(url=test_url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
          if not r.ok:
              return False, types, speed
          speed = round(time.time() - start, 2)
          content = json.loads(r.text)
          headers = content[u'headers']
          ip = content[u'origin']
          x_forwarded_for = headers.get(u'X-Forwarded-For', None)
          x_real_ip = headers.get(u'X-Real-Ip', None)
          if selfip in ip or ',' in ip:
              # Our own address (or a chain of addresses) reached the server.
              return False, types, speed
          if x_forwarded_for is None and x_real_ip is None:
              types = 0
          # Only one of the two headers may be present. Substituting an empty
          # string for the missing one avoids the TypeError that
          # `selfip not in None` used to raise, which the blanket handler
          # below turned into a false "proxy failed" result.
          elif selfip not in (x_forwarded_for or u'') and selfip not in (x_real_ip or u''):
              types = 1
          else:
              types = 2
          return True, types, speed
      except Exception:
          # Deliberately best-effort: any network or parsing problem simply
          # marks this probe as failed.
          return False, types, speed
  def getMyIP():
      '''
      Look up this machine's own IP by requesting config.TEST_IP directly
      (no proxy) and reading the 'origin' entry of the JSON response.

      :return: the value of the 'origin' entry
      :raise Test_URL_Fail: when the request or the parsing fails for any reason
      '''
      try:
          resp = requests.get(url=config.TEST_IP, headers=config.HEADER, timeout=config.TIMEOUT)
          return json.loads(resp.text)['origin']
      except Exception:
          raise Test_URL_Fail
  if __name__ == '__main__':
      # Manual smoke test: look up our own IP, then probe one hard-coded proxy.
      # The results are bound to names but not printed.
      myip = getMyIP()
      ip = "61.152.81.193"
      port = 9100
      proxy_url = "http://%s:%s" % (ip, port)
      proxies = {"http": proxy_url, "https": proxy_url}
      protocol, types, speed = checkProxy(myip, proxies)