Browse Source

Split into two versions, supporting py2 and py3

qiyeboy 8 years ago
parent
commit
76d0aeae37
69 changed files with 1605 additions and 5 deletions
  1. 0 0
      IPProxyPool_py2/IPProxy.py
  2. 0 0
      IPProxyPool_py2/api/__init__.py
  3. 0 0
      IPProxyPool_py2/api/apiServer.py
  4. 0 0
      IPProxyPool_py2/config.py
  5. BIN
      IPProxyPool_py2/data/proxy.db
  6. 0 0
      IPProxyPool_py2/data/qqwry.dat
  7. 0 0
      IPProxyPool_py2/db/DataStore.py
  8. 0 0
      IPProxyPool_py2/db/ISqlHelper.py
  9. 0 0
      IPProxyPool_py2/db/MongoHelper.py
  10. 0 0
      IPProxyPool_py2/db/RedisHelper.py
  11. 0 0
      IPProxyPool_py2/db/SqlHelper.py
  12. 0 0
      IPProxyPool_py2/db/__init__.py
  13. 0 0
      IPProxyPool_py2/logging.conf
  14. 0 0
      IPProxyPool_py2/spider/HtmlDownLoader.py
  15. 0 0
      IPProxyPool_py2/spider/HtmlPraser.py
  16. 0 0
      IPProxyPool_py2/spider/ProxyCrawl.py
  17. 0 0
      IPProxyPool_py2/spider/__init__.py
  18. 1 0
      IPProxyPool_py2/start.bat
  19. 0 0
      IPProxyPool_py2/test/__init__.py
  20. 0 0
      IPProxyPool_py2/test/test.py
  21. 0 0
      IPProxyPool_py2/test/testIPType.py
  22. 0 0
      IPProxyPool_py2/test/testbase64.py
  23. 0 0
      IPProxyPool_py2/test/testhttpserver.py
  24. 0 0
      IPProxyPool_py2/test/testlist.py
  25. 0 0
      IPProxyPool_py2/test/testlxml.py
  26. 0 0
      IPProxyPool_py2/test/testqueue.py
  27. 0 0
      IPProxyPool_py2/test/testsql.py
  28. 0 0
      IPProxyPool_py2/util/IPAddress.py
  29. 0 0
      IPProxyPool_py2/util/__init__.py
  30. 0 0
      IPProxyPool_py2/util/exception.py
  31. 0 0
      IPProxyPool_py2/util/logger.py
  32. 0 0
      IPProxyPool_py2/validator/Validator.py
  33. 0 0
      IPProxyPool_py2/validator/__init__.py
  34. 32 0
      IPProxyPool_py3/IPProxy.py
  35. 1 0
      IPProxyPool_py3/api/__init__.py
  36. 43 0
      IPProxyPool_py3/api/apiServer.py
  37. 190 0
      IPProxyPool_py3/config.py
  38. BIN
      IPProxyPool_py3/data/proxy.db
  39. BIN
      IPProxyPool_py3/data/qqwry.dat
  40. 49 0
      IPProxyPool_py3/db/DataStore.py
  41. 18 0
      IPProxyPool_py3/db/ISqlHelper.py
  42. 58 0
      IPProxyPool_py3/db/MongoHelper.py
  43. 30 0
      IPProxyPool_py3/db/RedisHelper.py
  44. 140 0
      IPProxyPool_py3/db/SqlHelper.py
  45. 1 0
      IPProxyPool_py3/db/__init__.py
  46. 65 0
      IPProxyPool_py3/logging.conf
  47. 72 0
      IPProxyPool_py3/spider/HtmlDownloader.py
  48. 179 0
      IPProxyPool_py3/spider/HtmlPraser.py
  49. 95 0
      IPProxyPool_py3/spider/ProxyCrawl.py
  50. 1 0
      IPProxyPool_py3/spider/__init__.py
  51. 1 0
      IPProxyPool_py3/start.bat
  52. 1 0
      IPProxyPool_py3/test/__init__.py
  53. 15 0
      IPProxyPool_py3/test/test.py
  54. 44 0
      IPProxyPool_py3/test/testIPType.py
  55. 10 0
      IPProxyPool_py3/test/testbase64.py
  56. 43 0
      IPProxyPool_py3/test/testhttpserver.py
  57. 39 0
      IPProxyPool_py3/test/testlist.py
  58. 127 0
      IPProxyPool_py3/test/testlxml.py
  59. 8 0
      IPProxyPool_py3/test/testqueue.py
  60. 12 0
      IPProxyPool_py3/test/testsql.py
  61. 145 0
      IPProxyPool_py3/util/IPAddress.py
  62. 4 0
      IPProxyPool_py3/util/__init__.py
  63. 16 0
      IPProxyPool_py3/util/exception.py
  64. 10 0
      IPProxyPool_py3/util/logger.py
  65. 152 0
      IPProxyPool_py3/validator/Validator.py
  66. 1 0
      IPProxyPool_py3/validator/__init__.py
  67. 2 4
      README.md
  68. BIN
      data/proxy.db
  69. 0 1
      start.bat

+ 0 - 0
IPProxy.py → IPProxyPool_py2/IPProxy.py


+ 0 - 0
api/__init__.py → IPProxyPool_py2/api/__init__.py


+ 0 - 0
api/apiServer.py → IPProxyPool_py2/api/apiServer.py


+ 0 - 0
config.py → IPProxyPool_py2/config.py


BIN
IPProxyPool_py2/data/proxy.db


+ 0 - 0
data/qqwry.dat → IPProxyPool_py2/data/qqwry.dat


+ 0 - 0
db/DataStore.py → IPProxyPool_py2/db/DataStore.py


+ 0 - 0
db/ISqlHelper.py → IPProxyPool_py2/db/ISqlHelper.py


+ 0 - 0
db/MongoHelper.py → IPProxyPool_py2/db/MongoHelper.py


+ 0 - 0
db/RedisHelper.py → IPProxyPool_py2/db/RedisHelper.py


+ 0 - 0
db/SqlHelper.py → IPProxyPool_py2/db/SqlHelper.py


+ 0 - 0
db/__init__.py → IPProxyPool_py2/db/__init__.py


+ 0 - 0
logging.conf → IPProxyPool_py2/logging.conf


+ 0 - 0
spider/HtmlDownLoader.py → IPProxyPool_py2/spider/HtmlDownLoader.py


+ 0 - 0
spider/HtmlPraser.py → IPProxyPool_py2/spider/HtmlPraser.py


+ 0 - 0
spider/ProxyCrawl.py → IPProxyPool_py2/spider/ProxyCrawl.py


+ 0 - 0
spider/__init__.py → IPProxyPool_py2/spider/__init__.py


+ 1 - 0
IPProxyPool_py2/start.bat

@@ -0,0 +1 @@
+python IPProxy.py

+ 0 - 0
test/__init__.py → IPProxyPool_py2/test/__init__.py


+ 0 - 0
test/test.py → IPProxyPool_py2/test/test.py


+ 0 - 0
test/testIPType.py → IPProxyPool_py2/test/testIPType.py


+ 0 - 0
test/testbase64.py → IPProxyPool_py2/test/testbase64.py


+ 0 - 0
test/testhttpserver.py → IPProxyPool_py2/test/testhttpserver.py


+ 0 - 0
test/testlist.py → IPProxyPool_py2/test/testlist.py


+ 0 - 0
test/testlxml.py → IPProxyPool_py2/test/testlxml.py


+ 0 - 0
test/testqueue.py → IPProxyPool_py2/test/testqueue.py


+ 0 - 0
test/testsql.py → IPProxyPool_py2/test/testsql.py


+ 0 - 0
util/IPAddress.py → IPProxyPool_py2/util/IPAddress.py


+ 0 - 0
util/__init__.py → IPProxyPool_py2/util/__init__.py


+ 0 - 0
util/exception.py → IPProxyPool_py2/util/exception.py


+ 0 - 0
util/logger.py → IPProxyPool_py2/util/logger.py


+ 0 - 0
validator/Validator.py → IPProxyPool_py2/validator/Validator.py


+ 0 - 0
validator/__init__.py → IPProxyPool_py2/validator/__init__.py


+ 32 - 0
IPProxyPool_py3/IPProxy.py

@@ -0,0 +1,32 @@
+#coding:utf-8
+from multiprocessing import Value, Queue, Process
+from api.apiServer import start_api_server
+import sys
+from db.DataStore import store_data
+from spider.ProxyCrawl import startProxyCrawl
+from validator.Validator import validator
+#import imp
+#imp.reload(sys)
+#sys.setdefaultencoding('utf8')
+
+
+
+
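+# Pipeline: ProxyCrawl -> q1 -> Validator -> q2 -> store_data, while the api
+# server answers queries straight from the database; DB_PROXY_NUM carries the
+# number of proxies currently validated in the db.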
+if __name__=="__main__":
+    DB_PROXY_NUM=Value('i',0)
+    q1 = Queue()
+    q2 = Queue()
+    p0 = Process(target=start_api_server)
+    p1 = Process(target=startProxyCrawl,args=(q1,DB_PROXY_NUM))
+    p2 = Process(target=validator,args=(q1,q2))
+    p3 = Process(target=store_data,args=(q2,DB_PROXY_NUM))
+    p0.start()
+    p1.start()
+    p2.start()
+    p3.start()
+
+
+
+
+
+

+ 1 - 0
IPProxyPool_py3/api/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 43 - 0
IPProxyPool_py3/api/apiServer.py

@@ -0,0 +1,43 @@
+#coding:utf-8
+'''
+Defines the query keywords: count, types, protocol, country, area
+'''
+import json
+import sys
+import web
+import config
+from db.DataStore import sqlhelper
+from db.SqlHelper import Proxy
+
+urls = (
+  '/', 'select',
+  '/delete','delete'
+)
+
+def start_api_server():
+    sys.argv.append('0.0.0.0:%s'%config.API_PORT)
+    app = web.application(urls, globals())
+    app.run()
+
+class select(object):
+
+    def GET(self):
+        inputs = web.input()
+        json_result = json.dumps(sqlhelper.select(inputs.get('count',None),inputs))
+        return json_result
+
+
+
+class delete(object):
+    params = {}
+    def GET(self):
+        inputs = web.input()
+        json_result = json.dumps(sqlhelper.delete(inputs))
+        return json_result
+
+
+
+if __name__=='__main__':
+    sys.argv.append('0.0.0.0:8000')
+    app = web.application(urls, globals())
+    app.run()
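The handlers above map '/' to select and '/delete' to delete. A minimal sketch of how a client exercises them, assuming the server is running on the default config.API_PORT of 8000 (the same flow appears in IPProxyPool_py3/test/test.py in this commit):

import json
import requests

# '/' -> select: 'count' caps the result size; the remaining parameters
# (ip, port, types, protocol, country, area) become filter conditions.
r = requests.get('http://127.0.0.1:8000/', params={'types': 0, 'count': 5})
print(json.loads(r.text))  # a list of (ip, port, score) rows

# '/delete' -> delete: remove matching proxies by the same keys.
requests.get('http://127.0.0.1:8000/delete', params={'ip': '120.92.3.127'})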

+ 190 - 0
IPProxyPool_py3/config.py

@@ -0,0 +1,190 @@
+#coding:utf-8
+'''
+Rule definition  urls: list of page URLs to crawl
+         type: parsing method; one of regular (regular expression), xpath (XPath parsing), module (custom third-party parser module)
+         pattern: a regular expression or an XPath expression, matching the type above
+'''
+from multiprocessing import Value
+import os
+import random
+
+'''
+Fields: ip, port, type (0 elite, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update)
+ speed (connection speed)
+'''
+parserList = [
+        {
+            'urls': ['http://www.66ip.cn/%s.html'% n for n in ['index']+list(range(2,12))],
+            'type':'xpath',
+            'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
+        },
+        {
+            'urls': ['http://www.66ip.cn/areaindex_%s/%s.html'%(m,n) for m in range(1,35) for n in range(1,10)],
+            'type':'xpath',
+            'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
+        },
+        {
+            'urls':['http://cn-proxy.com/','http://cn-proxy.com/archives/218'],
+            'type':'xpath',
+            'pattern':".//table[@class='sortable']/tbody/tr",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'','protocol':''}
+
+        },
+        {
+            'urls':['http://www.mimiip.com/gngao/%s'% n for n in range(1,10)],
+            'type':'xpath',
+            'pattern':".//table[@class='list']/tr",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'','protocol':''}
+
+        },
+        {
+            'urls':['https://proxy-list.org/english/index.php?p=%s'%n for n in range(1,10)],
+            'type':'module',
+            'moduleName':'proxy_listPraser',
+            'pattern':'Proxy\(.+\)',
+            'position':{'ip':0,'port':-1,'type':-1,'protocol':2}
+
+        },
+        {
+            'urls':['http://incloak.com/proxy-list/%s#list'%n for n in (['']+['?start=%s'%(64*m) for m in range(1,10)])],
+            'type':'xpath',
+            'pattern':".//table[@class='proxy__t']/tbody/tr",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'','protocol':''}
+
+        },
+        {
+            'urls': ['http://www.kuaidaili.com/proxylist/%s/'% n for n in range(1,11)],
+            'type': 'xpath',
+            'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
+        },
+        {
+            'urls': ['http://www.kuaidaili.com/free/%s/%s/'% (m,n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in range(1,11)],
+            'type':'xpath',
+            'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
+        },
+        {
+            'urls': ['http://www.cz88.net/proxy/%s'% m for m in ['index.shtml']+['http_%s.shtml' % n for n in range(2, 11)]],
+            'type':'xpath',
+            'pattern':".//*[@id='boxright']/div/ul/li[position()>1]",
+            'position':{'ip':'./div[1]','port':'./div[2]','type':'./div[3]','protocol':''}
+
+        },
+        {
+            'urls': ['http://www.ip181.com/daili/%s.html'% n for n in range(1, 11)],
+            'type':'xpath',
+            'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
+            'position':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
+
+        },
+        {
+            'urls': ['http://www.xicidaili.com/%s/%s'%(m,n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8) ],
+            'type':'xpath',
+            'pattern': ".//*[@id='ip_list']/tr[position()>1]",
+            'position':{'ip':'./td[2]','port':'./td[3]','type':'./td[5]','protocol':'./td[6]'}
+        },
+        {
+            'urls':['http://www.cnproxy.com/proxy%s.html'% i for i in range(1,11)],
+            'type':'module',
+            'moduleName':'CnproxyPraser',
+            'pattern':r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(\"\:\"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
+            'position':{'ip':0,'port':1,'type':-1,'protocol':2}
+        }
+        ]
+'''
+Database configuration
+'''
+DB_CONFIG={
+
+    'DB_CONNECT_TYPE':'sqlalchemy',  # 'pymongo' or 'sqlalchemy'
+    # 'DB_CONNECT_STRING':'mongodb://localhost:27017/'
+    'DB_CONNECT_STRING':'sqlite:///'+os.path.dirname(__file__)+'/data/proxy.db'
+    #DB_CONNECT_STRING = 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
+
+
+}
+CHINA_AREA=['河北','山东','辽宁','黑龙江','吉林'
+    ,'甘肃','青海','河南','江苏','湖北','湖南',
+            '江西','浙江','广东','云南','福建',
+            '台湾','海南','山西','四川','陕西',
+            '贵州','安徽','重庆','北京','上海','天津','广西','内蒙','西藏','新疆','宁夏','香港','澳门']
+QQWRY_PATH=os.path.dirname(__file__)+"/data/qqwry.dat"
+
+THREADNUM = 5
+API_PORT=8000
+'''
+Settings for crawling and validating ips.
+There is no need to check whether an ip already exists, because stale entries are cleaned up periodically.
+'''
+UPDATE_TIME=60*60   # check once an hour whether any proxy ip has gone stale
+MINNUM = 40         # start the crawler when the number of valid ips drops below this
+MAXTIME = 3*24*60   # maximum lifetime of a stored proxy; anything older is deleted
+
+TIMEOUT = 5         # socket timeout
+
+
+
+'''
+Settings for coping with sites' anti-crawler measures
+'''
+'''
+Number of retries
+'''
+RETRY_TIME=3
+
+
+'''
+USER_AGENTS: a pool of User-Agent headers, picked at random
+'''
+USER_AGENTS = [
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
+    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
+    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
+    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
+    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
+    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
+    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
+    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
+    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
+]
+
+HEADER = {
+    'User-Agent': random.choice(USER_AGENTS),
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Connection': 'keep-alive',
+    'Accept-Encoding': 'gzip, deflate',
+}
+
+TEST_URL='http://ip.chinaz.com/getip.aspx'
+# # A validation keyword, added to check that a tested proxy really reaches the target site
+# TEST_KEY = '站长工具'
+TEST_PROXY='http://www.stilllistener.com/checkpoint1/test11/'
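Each entry in parserList is consumed by the Html_Parser added later in this commit: for 'xpath' rules, 'pattern' selects one element per proxy row and the relative expressions in 'position' select the cells inside it. A minimal sketch with a made-up HTML snippet:

from lxml import etree

rule = {'type': 'xpath',
        'pattern': ".//table[@id='demo']//tr",
        'position': {'ip': './td[1]', 'port': './td[2]'}}
html = "<table id='demo'><tr><td>1.2.3.4</td><td>8080</td></tr></table>"

root = etree.HTML(html)
for row in root.xpath(rule['pattern']):               # one matched row per proxy
    ip = row.xpath(rule['position']['ip'])[0].text    # cell lookups are relative to the row
    port = row.xpath(rule['position']['port'])[0].text
    print(ip, port)                                   # -> 1.2.3.4 8080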

BIN
IPProxyPool_py3/data/proxy.db


BIN
IPProxyPool_py3/data/qqwry.dat


+ 49 - 0
IPProxyPool_py3/db/DataStore.py

@@ -0,0 +1,49 @@
+#coding:utf-8
+import sys
+from config import DB_CONFIG
+from util.exception import Con_DB_Fail
+
+
+try:
+    if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo':
+        from db.MongoHelper import MongoHelper as SqlHelper
+    else:
+        from db.SqlHelper import SqlHelper as SqlHelper
+    sqlhelper = SqlHelper()
+    sqlhelper.init_db()
+except Exception as e:
+    raise Con_DB_Fail
+
+
+def store_data(queue2,db_proxy_num):
+    '''
+    Read proxies from the queue and write them into the database.
+    :param queue2:
+    :return:
+    '''
+    successNum = 0
+    failNum = 0
+    while True:
+        try:
+            proxy = queue2.get(timeout=300)
+            if proxy:
+
+                sqlhelper.insert(proxy)
+                successNum += 1
+            else:
+                failNum += 1
+            msg = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum,failNum)
+            sys.stdout.write(msg+"\r")
+            sys.stdout.flush()
+        except BaseException as e:
+
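+            # Queue empty for 300s: fold the count of proxies the crawler
+            # re-validated from the db (db_proxy_num) into the success counter.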
+            if db_proxy_num.value != 0:
+                successNum += db_proxy_num.value
+                db_proxy_num.value=0
+                msg = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum,failNum)
+                sys.stdout.write(msg+"\r")
+                sys.stdout.flush()
+                successNum = 0
+                failNum = 0
+
+

+ 18 - 0
IPProxyPool_py3/db/ISqlHelper.py

@@ -0,0 +1,18 @@
+#coding:utf-8
+
+class ISqlHelper(object):
+    params = {'ip':None,'port':None,'types':None,'protocol':None,'country':None,'area':None}
+    def init_db(self):
+        raise NotImplementedError
+
+    def drop_db(self):
+        raise NotImplementedError
+
+    def insert(self, value=None):
+        raise NotImplementedError
+
+    def delete(self, conditions=None):
+        raise NotImplementedError
+
+    def update(self, conditions=None, value=None):
+        raise NotImplementedError
+
+    def select(self, count=None, conditions=None):
+        raise NotImplementedError

+ 58 - 0
IPProxyPool_py3/db/MongoHelper.py

@@ -0,0 +1,58 @@
+import pymongo
+from config import DB_CONFIG
+
+from db.ISqlHelper import ISqlHelper
+
+
+class MongoHelper(ISqlHelper):
+
+    def __init__(self):
+        self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'])
+
+    def init_db(self):
+        self.db = self.client.proxy
+        self.proxys = self.db.proxys
+
+
+    def drop_db(self):
+        self.client.drop_database(self.db)
+
+
+    def insert(self,value=None):
+        if value:
+            proxy = dict(ip=value['ip'],port=value['port'],types=value['types'],protocol=value['protocol'],country = value['country'],
+                         area=value['area'],speed=value['speed'],score=0)
+            self.proxys.insert(proxy)
+
+
+
+    def delete(self, conditions=None):
+        if conditions:
+            self.proxys.remove(conditions)
+            return ('deleteNum','ok')
+        else:
+            return ('deleteNum','None')
+
+
+    def update(self, conditions=None,value=None):
+        # update({"UserName":"libing"},{"$set":{"Email":"libing@126.com","Password":"123"}})
+        if conditions and value:
+            self.proxys.update(conditions,{"$set":value})
+            return {'updateNum':'ok'}
+        else:
+            return {'updateNum':'fail'}
+
+    def select(self, count=None,conditions=None):
+        if count:
+            count = int(count)
+        else:
+            count=0
+        items =self.proxys.find(filter = conditions,limit = count).sort([("speed",pymongo.ASCENDING),("score",pymongo.DESCENDING)])
+        results = []
+        for item in items:
+            result = (item['ip'],item['port'],item['score'])
+            results.append(result)
+        return results
+
+
+
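Note that insert, update, and remove are the legacy pymongo collection methods, deprecated in pymongo 3.x. A hedged sketch of the same operations against a newer driver, in case this helper is run with one (not part of this commit; assumes a local mongod):

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017/')
proxys = client.proxy.proxys  # same database/collection as init_db above

doc = {'ip': '1.2.3.4', 'port': 8080, 'types': 0, 'protocol': 0,
       'country': '中国', 'area': '广州', 'speed': 1.5, 'score': 0}
proxys.insert_one(doc)                                          # instead of insert()
proxys.update_many({'ip': '1.2.3.4'}, {'$set': {'score': 1}})   # instead of update()
proxys.delete_many({'ip': '1.2.3.4'})                           # instead of remove()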

+ 30 - 0
IPProxyPool_py3/db/RedisHelper.py

@@ -0,0 +1,30 @@
+#coding:utf-8
+from db.ISqlHelper import ISqlHelper
+
+
+class RedisHelper(ISqlHelper):
+
+    def __init__(self):
+        pass
+    def init_db(self):
+        pass
+
+
+    def drop_db(self):
+        pass
+
+
+    def insert(self,value):
+        pass
+
+
+    def delete(self, conditions):
+        pass
+
+    def update(self, conditions,value):
+        pass
+
+    def select(self, count=None,conditions=[]):
+        pass
+
+

+ 140 - 0
IPProxyPool_py3/db/SqlHelper.py

@@ -0,0 +1,140 @@
+#coding:utf-8
+import datetime
+from sqlalchemy import Column, Integer, String, DateTime,  Numeric, create_engine, VARCHAR
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+from config import DB_CONFIG
+
+from db.ISqlHelper import ISqlHelper
+
+'''
+Base class for SQL operations.
+Fields: ip, port, types (0 elite, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update)
+ speed (connection speed)
+'''
+
+BaseModel = declarative_base()
+class Proxy(BaseModel):
+    __tablename__='proxys'
+    id = Column(Integer, primary_key=True,autoincrement=True)
+    ip = Column(VARCHAR(16), nullable=False)
+    port = Column(Integer, nullable=False)
+    types = Column(Integer, nullable=False)
+    protocol = Column(Integer, nullable=False,default=0)
+    country = Column(VARCHAR(100), nullable=False)
+    area = Column(VARCHAR(100), nullable=False)
+    updatetime = Column(DateTime(),default=datetime.datetime.utcnow)
+    speed = Column(Numeric(5,2),nullable=False)
+    score = Column(Integer, nullable=False,default=0)
+
+
+class SqlHelper(ISqlHelper):
+    params =  {'ip':Proxy.ip,'port':Proxy.port,'types':Proxy.types,'protocol':Proxy.protocol,'country':Proxy.country,'area':Proxy.area,'score':Proxy.score}
+    def __init__(self):
+        if 'sqlite' in DB_CONFIG['DB_CONNECT_STRING']:
+            connect_args={'check_same_thread':False}
+            self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'],echo=False,connect_args=connect_args)
+        else:
+            self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'],echo=False)
+        DB_Session = sessionmaker(bind=self.engine)
+        self.session = DB_Session()
+
+    def init_db(self):
+         BaseModel.metadata.create_all(self.engine)
+    def drop_db(self):
+         BaseModel.metadata.drop_all(self.engine)
+
+
+    def insert(self,value):
+        proxy = Proxy(ip=value['ip'],port=value['port'],types=value['types'],protocol=value['protocol'],country = value['country'],
+                       area=value['area'],speed=value['speed'])
+        self.session.add(proxy)
+        self.session.commit()
+
+
+    def delete(self, conditions=None):
+        if conditions:
+            conditon_list = []
+            for key in list(conditions.keys()):
+                if self.params.get(key,None):
+                    conditon_list.append(self.params.get(key)==conditions.get(key))
+            conditions = conditon_list
+            query = self.session.query(Proxy)
+            for condition in conditions:
+                query = query.filter(condition)
+            deleteNum = query.delete()
+            self.session.commit()
+        else:
+            deleteNum = 0
+        return ('deleteNum',deleteNum)
+
+
+    def update(self, conditions=None,value=None):
+        '''
+        conditions is a dict shaped like self.params.
+        :param conditions:
+        :param value: also a dict, e.g. {'ip':'192.168.0.1'}
+        :return:
+        '''
+        if conditions and value:
+            conditon_list = []
+            for key in list(conditions.keys()):
+                if self.params.get(key,None):
+                    conditon_list.append(self.params.get(key)==conditions.get(key))
+            conditions = conditon_list
+            query = self.session.query(Proxy)
+            for condition in conditions:
+                query = query.filter(condition)
+            updatevalue = {}
+            for key in list(value.keys()):
+                if self.params.get(key,None):
+                    updatevalue[self.params.get(key,None)]=value.get(key)
+            updateNum = query.update(updatevalue)
+            self.session.commit()
+        else:
+            updateNum=0
+        return {'updateNum':updateNum}
+
+
+    def select(self, count=None,conditions=None):
+        '''
+        conditions is a dict shaped like self.params.
+        :param count:
+        :param conditions:
+        :return:
+        '''
+        if conditions:
+            conditon_list = []
+            for key in list(conditions.keys()):
+                if self.params.get(key,None):
+                    conditon_list.append(self.params.get(key)==conditions.get(key))
+            conditions = conditon_list
+        else:
+            conditions=[]
+
+        query = self.session.query(Proxy.ip,Proxy.port,Proxy.score)
+        if len(conditions)>0 and count:
+            for condition in conditions:
+                query = query.filter(condition)
+            return query.order_by(Proxy.score.desc(),Proxy.speed).limit(count).all()
+        elif count:
+            return query.order_by(Proxy.score.desc(),Proxy.speed).limit(count).all()
+        elif len(conditions)>0:
+            for condition in conditions:
+                query = query.filter(condition)
+            return query.order_by(Proxy.score.desc(),Proxy.speed).all()
+        else:
+            return query.order_by(Proxy.score.desc(),Proxy.speed).all()
+
+
+
+    def close(self):
+        pass
+
+if __name__=='__main__':
+
+    sqlhelper = SqlHelper()
+    sqlhelper.init_db()
+    proxy = {'ip':'192.168.1.1','port':80,'types':0,'protocol':0,'country':'中国','area':'广州','speed':11.123}  # key must be 'types' to match insert()
+    sqlhelper.insert(proxy)
+
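Beyond the insert demo in __main__, the query side can be exercised like this (a sketch against the sqlite file shipped with this commit; conditions may use any key listed in SqlHelper.params):

from db.SqlHelper import SqlHelper

helper = SqlHelper()
helper.init_db()
print(helper.select(count=5, conditions={'types': 0, 'country': '中国'}))
print(helper.update({'ip': '192.168.1.1'}, {'score': 1}))   # -> {'updateNum': n}
print(helper.delete({'ip': '192.168.1.1'}))                 # -> ('deleteNum', n)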

+ 1 - 0
IPProxyPool_py3/db/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 65 - 0
IPProxyPool_py3/logging.conf

@@ -0,0 +1,65 @@
+[loggers]
+keys=root,api,data,db,spider,validator,download
+
+[logger_root]
+level=INFO
+handlers=screen
+
+[logger_api]
+level=INFO
+handlers=screen
+qualname=api
+propagate=0
+
+[logger_util]
+level=INFO
+handlers=screen
+qualname=util
+propagate=0
+
+[logger_download]
+level=INFO
+handlers=screen
+qualname=download
+propagate=0
+
+[logger_data]
+level=DEBUG
+handlers=screen
+qualname=data
+propagate=0
+
+[logger_db]
+level=DEBUG
+handlers=screen
+qualname=db
+propagate=0
+
+[logger_spider]
+level=INFO
+handlers=screen
+qualname=spider
+propagate=0
+
+[logger_validator]
+level=INFO
+handlers=screen
+qualname=validator
+propagate=0
+
+[handlers]
+keys=screen
+
+[handler_screen]
+class=logging.StreamHandler
+formatter=pretty
+level=DEBUG
+args=(sys.stderr, )
+
+[formatters]
+keys=pretty
+
+[formatter_pretty]
+format= %(module)s %(asctime)s %(levelname)s %(lineno)d %(message)s
+datefmt= %Y-%m-%d %H:%M:%S
+class=logging.Formatter

+ 72 - 0
IPProxyPool_py3/spider/HtmlDownloader.py

@@ -0,0 +1,72 @@
+#coding:utf-8
+
+import random
+import config
+import json
+from db.DataStore import sqlhelper
+
+__author__ = 'qiye'
+
+import requests
+import chardet
+class Html_Downloader(object):
+
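+    # Try a direct fetch first; on failure, retry through proxies drawn from
+    # the db, up to config.RETRY_TIME times.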
+    @classmethod
+    def download(cls,url):
+        count = 0  # retry counter
+        r=''
+        try:
+            r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT)
+            r.encoding =chardet.detect(r.content)['encoding']
+            while count< config.RETRY_TIME:
+                if (not r.ok) or len(r.content)<500 :
+                    proxylist = sqlhelper.select(10)
+                    proxy = random.choice(proxylist)
+                    ip = proxy[0]
+                    port = proxy[1]
+                    proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
+                    try:
+                        r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+                        r.encoding =chardet.detect(r.content)['encoding']
+                        count += 1
+                    except Exception as e:
+                         count += 1
+
+
+                else:
+                    return r.text
+
+            return None
+
+        except Exception as e:
+            while count< config.RETRY_TIME:
+                if r==''or (not r.ok) or len(r.content)<500 :
+                    try:
+                        proxylist = sqlhelper.select(10)
+                        proxy = random.choice(proxylist)
+                        ip = proxy[0]
+                        port = proxy[1]
+                        proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
+                        try:
+                            r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+                            r.encoding =chardet.detect(r.content)['encoding']
+                            count += 1
+                        except Exception as e:
+                             count += 1
+
+                    except Exception as e:
+                        return None
+
+                else:
+                    return r.text
+
+            return None
+
+
+
+
+
+
+
+
+

+ 179 - 0
IPProxyPool_py3/spider/HtmlPraser.py

@@ -0,0 +1,179 @@
+#coding:utf-8
+import base64
+from config import QQWRY_PATH, CHINA_AREA
+
+from util.IPAddress import IPAddresss
+import re
+
+__author__ = 'qiye'
+from lxml import etree
+class Html_Parser(object):
+
+    def __init__(self):
+        self.ips = IPAddresss(QQWRY_PATH)
+    def parse(self,response,parser):
+        '''
+
+        :param response: the downloaded page
+        :param parser: the rule dict; parser['type'] selects the parsing method
+        :return:
+        '''
+        if parser['type']=='xpath':
+            return self.XpathPraser(response,parser)
+        elif parser['type']=='regular':
+            return self.RegularPraser(response,parser)
+        elif parser['type']=='module':
+            return getattr(self,parser['moduleName'],None)(response,parser)
+        else:
+            return None
+
+    def AuthCountry(self,addr):
+        '''
+        Decide whether an address is inside China.
+        :param addr:
+        :return:
+        '''
+        for area in CHINA_AREA:
+            if addr.find(area)!=-1:
+                return True
+        return False
+
+
+
+    def XpathPraser(self,response,parser):
+        '''
+        Parse the page with the XPath rule.
+        :param response:
+        :param parser:
+        :return:
+        '''
+        # print response
+        proxylist=[]
+        root = etree.HTML(response)
+        proxys = root.xpath(parser['pattern'])
+        # print proxys
+        for proxy in proxys:
+            # print parser['postion']['ip']
+            try:
+                ip = proxy.xpath(parser['position']['ip'])[0].text
+                port = proxy.xpath(parser['position']['port'])[0].text
+                # type = proxy.xpath(parser['postion']['type'])[0].text
+                # # print ip,port,type
+                # if type.find(u'高匿')!=-1:
+                #     type = 0
+                # else:
+                #     type = 1
+                # protocol=''
+                # if len(parser['postion']['protocol']) > 0:
+                #     protocol = proxy.xpath(parser['postion']['protocol'])[0].text
+                #     if protocol.lower().find('https')!=-1:
+                #         protocol = 1
+                #     else:
+                #         protocol = 0
+                # else:
+                type=0
+                protocol = 0
+                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
+                country = ''
+                area = ''
+                if addr.find('省')!=-1 or self.AuthCountry(addr):
+                    country = '中国'
+                    area = addr
+                else:
+                    country = addr
+                    area = ''
+            except Exception as e:
+                continue
+            # updatetime = datetime.datetime.now()
+            # ip, port, type (0 elite, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime
+
+            # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
+            proxy ={'ip':ip,'port':int(port),'types':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
+            proxylist.append(proxy)
+        return proxylist
+
+    def RegularPraser(self,response,parser):
+        '''
+        Parse the page with the regular-expression rule.
+        :param response:
+        :param parser:
+        :return:
+        '''
+        proxylist=[]
+        pattern = re.compile(parser['pattern'])
+        matchs = pattern.findall(response)
+        if matchs !=None:
+            for match in matchs:
+                ip = match[parser['position']['ip']]
+                port = match[parser['position']['port']]
+                # the type these sites report is unreliable, so default it here; it is re-checked later
+                type =0
+                # if parser['postion']['protocol'] > 0:
+                #     protocol = match[parser['postion']['protocol']]
+                #     if protocol.lower().find('https')!=-1:
+                #         protocol = 1
+                #     else:
+                #         protocol = 0
+                # else:
+                protocol = 0
+                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
+                country = ''
+                area = ''
+                if addr.find('省')!=-1 or self.AuthCountry(addr):
+                    country = '中国'
+                    area = addr
+                else:
+                    country = addr
+                    area = ''
+                proxy ={'ip':ip,'port':port,'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
+
+                proxylist.append(proxy)
+            return proxylist
+
+
+    def CnproxyPraser(self,response,parser):
+        proxylist = self.RegularPraser(response,parser)
+        chardict ={'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'}
+
+        for proxy in proxylist:
+            port = proxy['port']
+            new_port = ''
+            for i in range(len(port)):
+                if port[i]!='+':
+                   new_port += chardict[port[i]]
+            new_port = int(new_port)
+            proxy['port'] =new_port
+        return proxylist
+
+
+    def proxy_listPraser(self,response,parser):
+        proxylist=[]
+        pattern = re.compile(parser['pattern'])
+        matchs = pattern.findall(response)
+        if matchs:
+            for match in matchs:
+                ip_port = base64.b64decode(match.replace("Proxy('","").replace("')","")).decode('utf-8')  # b64decode returns bytes in py3
+                ip = ip_port.split(':')[0]
+                port = ip_port.split(':')[1]
+                type =0
+                protocol = 0
+                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
+                country = ''
+                area = ''
+                if addr.find('省')!=-1 or self.AuthCountry(addr):
+                    country = '中国'
+                    area = addr
+                else:
+                    country = addr
+                    area = ''
+                proxy ={'ip':ip,'port':int(port),'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
+
+                proxylist.append(proxy)
+            return proxylist
+
+
+
+
+
+
+

+ 95 - 0
IPProxyPool_py3/spider/ProxyCrawl.py

@@ -0,0 +1,95 @@
+#coding:utf-8
+import gevent
+from gevent.pool import Pool
+from multiprocessing import Queue, Process, Value
+import time
+import sys
+from api.apiServer import start_api_server
+from config import THREADNUM, parserList, UPDATE_TIME, MINNUM
+from db.DataStore import store_data, sqlhelper
+from spider.HtmlDownloader import Html_Downloader
+from spider.HtmlPraser import Html_Parser
+from validator.Validator import validator, getMyIP,detect_from_db
+
+
+__author__ = 'qiye'
+from gevent import monkey
+monkey.patch_all()
+'''
+This class implements the crawler logic.
+'''
+
+def startProxyCrawl(queue,db_proxy_num):
+    crawl = ProxyCrawl(queue,db_proxy_num)
+    crawl.run()
+
+class ProxyCrawl(object):
+    proxies = set()
+    def __init__(self,queue,db_proxy_num):
+        self.crawl_pool = Pool(THREADNUM)
+        self.queue = queue
+        self.db_proxy_num = db_proxy_num
+
+
+    def run(self):
+        while True:
+            self.proxies.clear()
+            msg = 'IPProxyPool----->>>>>>>>beginning'
+            sys.stdout.write(msg+"\r\n")
+            sys.stdout.flush()
+            proxylist = sqlhelper.select()
+            myip = getMyIP()
+            spawns = []
+            for proxy in proxylist:
+                spawns.append(gevent.spawn(detect_from_db,myip,proxy,self.proxies))
+            gevent.joinall(spawns)
+            self.db_proxy_num.value = len(self.proxies)
+            msg = 'IPProxyPool----->>>>>>>>db exists ip:%d'%len(self.proxies)
+
+            if len(self.proxies)<MINNUM:
+                msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
+                sys.stdout.write(msg+"\r\n")
+                sys.stdout.flush()
+                self.crawl_pool.map(self.crawl,parserList)
+            else:
+                msg += '\r\nIPProxyPool----->>>>>>>>now ip num meets the requirement,wait UPDATE_TIME...'
+                sys.stdout.write(msg+"\r\n")
+                sys.stdout.flush()
+
+            time.sleep(UPDATE_TIME)
+
+
+
+    def crawl(self,parser):
+        html_parser = Html_Parser()
+        for url in parser['urls']:
+           response = Html_Downloader.download(url)
+           if response!=None:
+               proxylist= html_parser.parse(response,parser)
+               if proxylist != None:
+                    for proxy in proxylist:
+                        proxy_str ='%s:%s'%(proxy['ip'],proxy['port'])
+                        if proxy_str not in self.proxies:
+                            self.proxies.add(proxy_str)
+                            self.queue.put(proxy)
+
+
+
+
+if __name__=="__main__":
+    DB_PROXY_NUM=Value('i',0)
+    q1 = Queue()
+    q2 = Queue()
+    p0 = Process(target=start_api_server)
+    p1 = Process(target=startProxyCrawl,args=(q1,DB_PROXY_NUM))
+    p2 = Process(target=validator,args=(q1,q2))
+    p3 = Process(target=store_data,args=(q2,DB_PROXY_NUM))
+
+    p0.start()
+    p1.start()
+    p2.start()
+    p3.start()
+
+
+    # spider = ProxyCrawl()
+    # spider.run()

+ 1 - 0
IPProxyPool_py3/spider/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 1 - 0
IPProxyPool_py3/start.bat

@@ -0,0 +1 @@
+python IPProxy.py

+ 1 - 0
IPProxyPool_py3/test/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 15 - 0
IPProxyPool_py3/test/test.py

@@ -0,0 +1,15 @@
+#coding:utf-8
+import requests
+import json
+r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国')
+ip_ports = json.loads(r.text)
+print(ip_ports)
+ip = ip_ports[0]['ip']
+port = ip_ports[0]['port']
+proxies={
+    'http':'http://%s:%s'%(ip,port),
+    'https':'http://%s:%s'%(ip,port)
+}
+r = requests.get('http://ip.chinaz.com/',proxies=proxies)
+r.encoding='utf-8'
+print r.text

+ 44 - 0
IPProxyPool_py3/test/testIPType.py

@@ -0,0 +1,44 @@
+#coding:utf-8
+from lxml import etree
+import requests
+import config
+
+
+def checkProxyType(selfip,proxies):
+    '''
+    Detect the anonymity type of a proxy; the type advertised by free proxy sites is unreliable, so check it directly.
+    :param proxies: the proxy to test (0 elite, 1 anonymous, 2 transparent, 3 invalid)
+    :return:
+    '''
+
+    try:
+        r = requests.get(url='https://incloak.com/ip/',headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+        print(r.text)
+        # if r.ok:
+        #     root = etree.HTML(r.text)
+        #     ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
+        #     http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
+        #     http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
+        #     # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
+        #     if ip==selfip:
+        #         return 3
+        #     if http_x_forwared_for is None and http_via is None:
+        #         return 0
+        #     if http_via != None and http_x_forwared_for.find(selfip)== -1:
+        #         return 1
+        #
+        #     if http_via != None and http_x_forwared_for.find(selfip)!= -1:
+        #         return 2
+        # return 3
+
+
+    except Exception as e:
+        print(str(e))
+        return 3
+
+
+if __name__=='__main__':
+    ip = '61.132.241.109'
+    port = '808'
+    proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
+    checkProxyType(None,proxies)

+ 10 - 0
IPProxyPool_py3/test/testbase64.py

@@ -0,0 +1,10 @@
+#coding:utf-8
+import base64
+import re
+str='''
+<script type="text/javascript">Proxy('NzcuODcuMjEuODY6ODA4MA==')</script></li>
+'''
+match = re.search('Proxy\(.+\)',str)
+print(match.group())
+ip_port = base64.b64decode(match.group().replace("Proxy('","").replace("')","")).decode('utf-8')  # b64decode returns bytes in py3
+print(ip_port)

+ 43 - 0
IPProxyPool_py3/test/testhttpserver.py

@@ -0,0 +1,43 @@
+
+#coding:utf-8
+from http.server import BaseHTTPRequestHandler, HTTPServer
+import json
+from urllib.parse import urlparse
+class WebRequestHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        """
+        """
+        print(self.path)
+        parsed_path = urlparse(self.path)
+        print(parsed_path)
+        print(parsed_path.query)
+        # message_parts = [
+        #         'CLIENT VALUES:',
+        #         'client_address=%s (%s)' % (self.client_address,
+        #                                     self.address_string()),
+        #         'command=%s' % self.command,
+        #         'path=%s' % self.path,
+        #         'real path=%s' % parsed_path.path,
+        #         'query=%s' % parsed_path.query,
+        #         'request_version=%s' % self.request_version,
+        #         '',
+        #         'SERVER VALUES:',
+        #         'server_version=%s' % self.server_version,
+        #         'sys_version=%s' % self.sys_version,
+        #         'protocol_version=%s' % self.protocol_version,
+        #         '',
+        #         'HEADERS RECEIVED:',
+        #         ]
+        # for name, value in sorted(self.headers.items()):
+        #     message_parts.append('%s=%s' % (name, value.rstrip()))
+        # message_parts.append('')
+        # message = '\r\n'.join(message_parts)
+        data1 = [{'ip':'192.168.0.0','port':456}]*10
+        d1 = json.dumps(data1,sort_keys=True,indent=4)
+        message=('192.168.1.1',80)
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(d1.encode('utf-8'))  # wfile expects bytes in py3
+
+server = HTTPServer(('0.0.0.0',8000), WebRequestHandler)
+server.serve_forever()

+ 39 - 0
IPProxyPool_py3/test/testlist.py

@@ -0,0 +1,39 @@
+#coding:utf-8
+from decimal import Decimal
+
+__author__ = 'Xaxdus'
+
+
+# list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)]
+#
+# list = [(1,10)]*10
+#
+# for m,n in list:
+#     print m,n
+#
+#
+# list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list]
+# print list2
+
+# x=Decimal('0.998531571219').quantize(Decimal('0.00'))
+# a= 0.998531571219
+# value = round(a, 3)
+# print x,type(x),value
+# proxys=[]
+# proxy=[123,1234]
+# proxys.append(proxy)
+#
+# proxy=[123,1234]
+# proxys.append(proxy)
+#
+# print proxys
+# l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}]
+#
+# # for d in l:
+# #    print  [tuple(d.items())]
+# print [tuple(d.items()) for d in l]
+#
+# print [dict(t) for t in set([tuple(d.items()) for d in l])]
+import requests
+r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127')
+print(r.text)

The diff for this file is not shown because it is too large
+ 127 - 0
IPProxyPool_py3/test/testlxml.py


+ 8 - 0
IPProxyPool_py3/test/testqueue.py

@@ -0,0 +1,8 @@
+#coding:utf-8
+from multiprocessing import Queue
+try:
+    q = Queue()
+    q.get(timeout=5)
+except BaseException as e:
+    print('--'+str(e))
+

+ 12 - 0
IPProxyPool_py3/test/testsql.py

@@ -0,0 +1,12 @@
+#coding:utf-8
+from db.SqlHelper import SqlHelper
+from util.exception import Con_DB_Fail
+
+try:
+    sqlhelper = SqlHelper()
+    sqlhelper.init_db()
+except Exception as e:
+    raise Con_DB_Fail
+
+proxy ={'ip':'192.168.1.1','port':int('80'),'types':0,'protocol':0,'country':u'中国','area':u'四川','speed':0}  # key must be 'types' to match SqlHelper.insert()
+sqlhelper.insert(proxy)

+ 145 - 0
IPProxyPool_py3/util/IPAddress.py

@@ -0,0 +1,145 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+
+import socket
+import struct
+
+import logging
+logger = logging.getLogger('util')
+
+class IPAddresss:
+    def __init__(self, ipdbFile):
+        self.ipdb = open(ipdbFile, "rb")
+        buf = self.ipdb.read(8)
+        (self.firstIndex, self.lastIndex) = struct.unpack('II', buf)
+        self.indexCount = (self.lastIndex - self.firstIndex) // 7 + 1  # integer division for py3
+        # print(self.getVersion(), " total records: %d" % self.indexCount)
+
+    def getVersion(self):
+        s = self.getIpAddr(0xffffff00)
+        return s
+
+    def getAreaAddr(self, offset=0):
+        if offset:
+            self.ipdb.seek(offset)
+        str = self.ipdb.read(1)
+        (byte,) = struct.unpack('B', str)
+        if byte == 0x01 or byte == 0x02:
+            p = self.getLong3()
+            if p:
+                return self.getString(p)
+            else:
+                return ""
+        else:
+            self.ipdb.seek(-1, 1)
+            return self.getString(offset)
+
+    def getAddr(self, offset, ip=0):
+        self.ipdb.seek(offset + 4)
+        countryAddr = ""
+        areaAddr = ""
+        str = self.ipdb.read(1)
+        (byte,) = struct.unpack('B', str)
+        if byte == 0x01:
+            countryOffset = self.getLong3()
+            self.ipdb.seek(countryOffset)
+            str = self.ipdb.read(1)
+            (b,) = struct.unpack('B', str)
+            if b == 0x02:
+                countryAddr = self.getString(self.getLong3())
+                self.ipdb.seek(countryOffset + 4)
+            else:
+                countryAddr = self.getString(countryOffset)
+            areaAddr = self.getAreaAddr()
+        elif byte == 0x02:
+            countryAddr = self.getString(self.getLong3())
+            areaAddr = self.getAreaAddr(offset + 8)
+        else:
+            countryAddr = self.getString(offset + 4)
+            areaAddr = self.getAreaAddr()
+        return countryAddr + " " + areaAddr
+
+    def dump(self, first , last):
+        if last > self.indexCount :
+            last = self.indexCount
+        for index in range(first, last):
+            offset = self.firstIndex + index * 7
+            self.ipdb.seek(offset)
+            buf = self.ipdb.read(7)
+            (ip, of1, of2) = struct.unpack("IHB", buf)
+            address = self.getAddr(of1 + (of2 << 16))
+            # getString already decodes the GBK bytes to str
+            logger.info("%d %s %s" % (index, self.ip2str(ip), address))
+
+    def setIpRange(self, index):
+        offset = self.firstIndex + index * 7
+        self.ipdb.seek(offset)
+        buf = self.ipdb.read(7)
+        (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
+        self.curEndIpOffset = of1 + (of2 << 16)
+        self.ipdb.seek(self.curEndIpOffset)
+        buf = self.ipdb.read(4)
+        (self.curEndIp,) = struct.unpack("I", buf)
+
+    def getIpAddr(self, ip):
+        L = 0
+        R = self.indexCount - 1
+        while L < R-1:
+            M = (L + R) // 2  # integer division for py3
+            self.setIpRange(M)
+            if ip == self.curStartIp:
+                L = M
+                break
+            if ip > self.curStartIp:
+                L = M
+            else:
+                R = M
+        self.setIpRange(L)
+        # version information, 255.255.255.X, urgy but useful
+        if ip & 0xffffff00 == 0xffffff00:
+            self.setIpRange(R)
+        if self.curStartIp <= ip <= self.curEndIp:
+            # getString already decodes the GBK bytes to str
+            address = self.getAddr(self.curEndIpOffset)
+        else:
+            address = "未找到该IP的地址"
+        return address
+
+    def getIpRange(self, ip):
+        self.getIpAddr(ip)
+        range = self.ip2str(self.curStartIp) + ' - ' \
+            + self.ip2str(self.curEndIp)
+        return range
+
+    def getString(self, offset = 0):
+        if offset :
+            self.ipdb.seek(offset)
+        raw = b""
+        ch = self.ipdb.read(1)
+        (byte,) = struct.unpack('B', ch)
+        while byte != 0:
+            raw += ch
+            ch = self.ipdb.read(1)
+            (byte,) = struct.unpack('B', ch)
+        return raw.decode('gbk', 'replace')  # qqwry.dat strings are GBK-encoded bytes
+
+    def ip2str(self, ip):
+        return str(ip >> 24)+'.'+str((ip >> 16) & 0xff)+'.'+str((ip >> 8) & 0xff)+'.'+str(ip & 0xff)
+
+    def str2ip(self, s):
+        (ip,) = struct.unpack('I', socket.inet_aton(s))
+        return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8)
+
+    def getLong3(self, offset=0):
+        if offset:
+            self.ipdb.seek(offset)
+        buf = self.ipdb.read(3)
+        (a, b) = struct.unpack('HB', buf)
+        return (b << 16) + a
+
+
+
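A short usage sketch, assuming the data/qqwry.dat shipped with this commit sits next to the caller:

from util.IPAddress import IPAddresss

ips = IPAddresss('data/qqwry.dat')
addr = ips.getIpAddr(ips.str2ip('1.2.4.8'))  # country/area string from qqwry.dat
print(addr)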

+ 4 - 0
IPProxyPool_py3/util/__init__.py

@@ -0,0 +1,4 @@
+
+
+__author__ = 'Xaxdus'
+

+ 16 - 0
IPProxyPool_py3/util/exception.py

@@ -0,0 +1,16 @@
+#coding:utf-8
+import config
+
+
+class Test_URL_Fail(Exception):
+
+
+    def __str__(self):
+        str = "访问%s失败,请检查网络连接"%config.TEST_URL
+        return str
+
+class Con_DB_Fail(Exception):
+
+    def __str__(self):
+        str = "使用DB_CONNECT_STRING:%s--连接数据库失败"%config.DB_CONNECT_STRING
+        return str

+ 10 - 0
IPProxyPool_py3/util/logger.py

@@ -0,0 +1,10 @@
+#coding:utf-8
+import logging
+
+__author__ = 'qiye'
+
+
+logger = logging.getLogger()
+def logger_proxy(proxy):
+   logger.setLevel(logging.INFO)
+   logger.info(proxy)

+ 152 - 0
IPProxyPool_py3/validator/Validator.py

@@ -0,0 +1,152 @@
+#coding:utf-8
+import json
+from multiprocessing import Process
+import re
+import gevent
+
+from lxml import etree
+import requests
+import time
+from config import TEST_URL
+import config
+from db.DataStore import sqlhelper
+from util.exception import Test_URL_Fail
+
+
+
+from gevent import monkey
+monkey.patch_all()
+
+
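+# Re-validate a proxy taken from the db: on success bump its score (capped at
+# 60000) and keep it, otherwise delete it.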
+def detect_from_db(myip,proxy,proxies_set):
+    proxy_dict = {'ip':proxy[0],'port':proxy[1]}
+    result = detect_list(myip,proxy_dict)
+    if result:
+        if proxy[2]<60000:
+            score = proxy[2] + 1
+        else:
+            score = 60000
+        proxy_str ='%s:%s'%(proxy[0],proxy[1])
+        proxies_set.add(proxy_str)
+        sqlhelper.update({'ip':proxy[0],'port':proxy[1]},{'score':score})
+    else:
+        sqlhelper.delete({'ip':proxy[0],'port':proxy[1]})
+
+
+
+
+def validator(queue1,queue2):
+    tasklist=[]
+    myip = getMyIP()
+    while True:
+        try:
+            # proxy_dict = {'source':'crawl','data':proxy}
+            proxy = queue1.get(timeout=10)
+            tasklist.append(proxy)
+            if len(tasklist)>500:
+                p = Process(target=process_start,args=(tasklist,myip,queue2))
+                p.start()
+                tasklist=[]
+        except Exception as e:
+            if len(tasklist)>0:
+                p = Process(target=process_start,args=(tasklist,myip,queue2))
+                p.start()
+                tasklist=[]
+
+
+def process_start(tasks,myip,queue2):
+    spawns = []
+    for task in tasks:
+        spawns.append(gevent.spawn(detect_list,myip,task,queue2))
+    gevent.joinall(spawns)
+
+
+def detect_list(selfip,proxy,queue2=None):
+    '''
+    :param proxy: a dict with 'ip' and 'port' keys
+    :return:
+    '''
+    ip = proxy['ip']
+    port = proxy['port']
+    proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
+    # proxyType = checkProxyType(selfip,proxies)
+    # if proxyType==3:
+    #     logger.info('failed %s:%s'%(ip,port))
+    #     proxy = None
+    #     queue2.put(proxy)
+    #     return proxy
+    # else:
+    #     proxy['type']=proxyType
+    proxy['type']=0
+    start = time.time()
+    try:
+        r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+
+        if not r.ok or r.text.find(ip)==-1:
+            proxy = None
+        else:
+            speed = round(time.time()-start,2)
+            proxy['speed']=speed
+    except Exception as e:
+            proxy = None
+
+    if queue2:
+        queue2.put(proxy)
+    return proxy
+
+def checkProxyType(selfip,proxies):
+    '''
+    Detect the anonymity type of a proxy; the type advertised by free proxy sites is unreliable, so check it directly.
+    :param proxies: the proxy to test (0 elite, 1 anonymous, 2 transparent, 3 invalid)
+    :return:
+    '''
+
+    try:
+
+        r = requests.get(url=config.TEST_PROXY,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+        if r.ok:
+            root = etree.HTML(r.text)
+            ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
+            http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
+            http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
+            # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
+            if ip==selfip:
+                return 3
+            if http_x_forwared_for is None and http_via is None:
+                return 0
+            if http_via != None and http_x_forwared_for.find(selfip)== -1:
+                return 1
+
+            if http_via != None and http_x_forwared_for.find(selfip)!= -1:
+                return 2
+        return 3
+
+
+
+    except Exception as e:
+        return 3
+
+
+
+
+def getMyIP():
+    try:
+        r = requests.get(url=config.TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT)
+        pattern = '\d+\.\d+\.\d+\.\d+'
+        match =re.search(pattern,r.text)
+        if match:
+            ip = match.group()
+            return ip
+        else:
+
+            raise Test_URL_Fail
+    except Exception as e:
+            raise Test_URL_Fail
+
+if __name__=='__main__':
+    getMyIP()
+    # str="{ip:'61.150.43.121',address:'陕西省西安市 西安电子科技大学'}"
+    # j = json.dumps(str)
+    # str = j['ip']
+    # print str
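For reference, a single proxy can be validated in isolation (a sketch: it needs network access and a reachable config.TEST_URL; the ip/port pair is only an illustration, taken from test/testIPType.py):

from validator.Validator import detect_list, getMyIP

myip = getMyIP()  # raises Test_URL_Fail when config.TEST_URL is unreachable
result = detect_list(myip, {'ip': '61.132.241.109', 'port': '808'})
print(result)     # the proxy dict with 'speed' filled in, or None on failure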

+ 1 - 0
IPProxyPool_py3/validator/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 2 - 4
README.md

@@ -1,5 +1,5 @@
 # IPProxyPool
-The IPProxyPool proxy pool project provides proxy ips. Developed with python2.7.x; python3 support planned
+The IPProxyPool proxy pool project provides proxy ips. Both py2 and py3 versions are supported
 <br/>
Note: the site used to verify ip anonymity has gone down, so crawled ips can no longer be checked for anonymity.
 <br/>
@@ -193,9 +193,7 @@ print r.text
 ```
 
 ## TODO
-1. Add support for Python 3.x
-<br/>
-2. Optionally add a squid reverse-proxy server to simplify crawler configuration
+1. Optionally add a squid reverse-proxy server to simplify crawler configuration
 <br/>
 
 

BIN
data/proxy.db


+ 0 - 1
start.bat

@@ -1 +0,0 @@
-python IPProxys.py

Not all changed files are shown because too many files have changed