Merge py2 and py3 versions

qiyeboy 8 years ago
parent
commit
acce5a7a4d
67 changed files with 85 additions and 1617 deletions
  1. IPProxy.py +3 -5
  2. IPProxyPool_py2/config.py +0 -193
  3. IPProxyPool_py2/data/proxy.db BIN
  4. IPProxyPool_py2/db/DataStore.py +0 -49
  5. IPProxyPool_py2/db/RedisHelper.py +0 -31
  6. IPProxyPool_py2/db/SqlHelper.py +0 -146
  7. IPProxyPool_py2/spider/HtmlDownLoader.py +0 -73
  8. IPProxyPool_py2/spider/HtmlPraser.py +0 -168
  9. IPProxyPool_py2/spider/ProxyCrawl.py +0 -97
  10. IPProxyPool_py2/test/test.py +0 -16
  11. IPProxyPool_py2/test/testHttpbin.py +0 -8
  12. IPProxyPool_py2/test/testIPType.py +0 -44
  13. IPProxyPool_py2/test/testbase64.py +0 -11
  14. IPProxyPool_py2/test/testhttpserver.py +0 -45
  15. IPProxyPool_py2/test/testlist.py +0 -40
  16. IPProxyPool_py2/test/testlxml.py +0 -127
  17. IPProxyPool_py2/test/testqueue.py +0 -9
  18. IPProxyPool_py2/util/IPAddress.py +0 -147
  19. IPProxyPool_py2/util/logger.py +0 -11
  20. IPProxyPool_py2/validator/Validator.py +0 -170
  21. IPProxyPool_py3/IPProxy.py +0 -33
  22. IPProxyPool_py3/api/__init__.py +0 -1
  23. IPProxyPool_py3/api/apiServer.py +0 -43
  24. IPProxyPool_py3/data/qqwry.dat BIN
  25. IPProxyPool_py3/db/ISqlHelper.py +0 -22
  26. IPProxyPool_py3/db/MongoHelper.py +0 -58
  27. IPProxyPool_py3/db/__init__.py +0 -1
  28. IPProxyPool_py3/spider/__init__.py +0 -1
  29. IPProxyPool_py3/start.bat +0 -1
  30. IPProxyPool_py3/test/__init__.py +0 -1
  31. IPProxyPool_py3/test/testsql.py +0 -12
  32. IPProxyPool_py3/util/__init__.py +0 -2
  33. IPProxyPool_py3/util/exception.py +0 -14
  34. IPProxyPool_py3/validator/__init__.py +0 -1
  35. README.md +9 -1
  36. api/__init__.py +0 -0
  37. api/apiServer.py +0 -0
  38. config.py +0 -0
  39. data/qqwry.dat +0 -0
  40. db/DataStore.py +0 -0
  41. db/ISqlHelper.py +0 -0
  42. db/MongoHelper.py +20 -2
  43. db/RedisHelper.py +0 -0
  44. db/SqlHelper.py +0 -0
  45. db/__init__.py +0 -0
  46. spider/HtmlDownloader.py +0 -1
  47. spider/HtmlPraser.py +22 -21
  48. spider/ProxyCrawl.py +0 -1
  49. spider/__init__.py +0 -0
  50. start.bat +0 -0
  51. test/__init__.py +0 -0
  52. test/test.py +0 -0
  53. test/testIPAddress.py +0 -0
  54. test/testIPType.py +1 -1
  55. test/testbase64.py +0 -0
  56. test/testhttpserver.py +0 -0
  57. test/testlist.py +0 -0
  58. test/testlxml.py +0 -0
  59. test/testqueue.py +0 -0
  60. test/testsql.py +0 -0
  61. util/IPAddress.py +7 -6
  62. util/__init__.py +0 -0
  63. util/compatibility.py +22 -0
  64. util/exception.py +0 -0
  65. util/logger.py +0 -0
  66. validator/Validator.py +0 -3
  67. validator/__init__.py +0 -0

+ 3 - 5
IPProxyPool_py2/IPProxy.py → IPProxy.py

@@ -1,13 +1,11 @@
 # coding:utf-8
+
 from multiprocessing import Value, Queue, Process
 from api.apiServer import start_api_server
-import sys
 from db.DataStore import store_data
-from spider.ProxyCrawl import startProxyCrawl
-from validator.Validator import validator, getMyIP
 
-reload(sys)
-sys.setdefaultencoding('utf8')
+from validator.Validator import validator, getMyIP
+from spider.ProxyCrawl import startProxyCrawl
 
 if __name__ == "__main__":
     myip = getMyIP()
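
The two removed lines were the Python-2-only default-encoding hack; sys.setdefaultencoding does not exist on Python 3, so the merged entry point simply drops it. For illustration only, a hedged sketch of how such a hack would have to be version-guarded if it were kept (not what this commit does):

import sys

# Python-2-only hack, guarded so the same file also imports on Python 3,
# where str is already unicode and setdefaultencoding was removed.
if sys.version_info[0] == 2:
    reload(sys)  # reload() is a builtin on Python 2 only
    sys.setdefaultencoding('utf8')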

+ 0 - 193
IPProxyPool_py2/config.py

@@ -1,193 +0,0 @@
-# coding:utf-8
-'''
-Rule definitions. urls: the list of URLs to crawl
-         type: parsing method, one of regular (regex), xpath (XPath parsing), module (custom third-party parser module)
-         pattern: a regex or an XPath expression, matching the type above
-'''
-from multiprocessing import Value
-import os
-import random
-
-'''
-Fields: ip, port, type (0 elite/high-anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last updated)
- speed (connection speed)
-'''
-parserList = [
-    {
-        'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + range(2, 12)],
-        'type': 'xpath',
-        'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
-    },
-    {
-        'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
-        'type': 'xpath',
-        'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
-    },
-    {
-        'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'],
-        'type': 'xpath',
-        'pattern': ".//table[@class='sortable']/tbody/tr",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
-
-    },
-    {
-        'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)],
-        'type': 'xpath',
-        'pattern': ".//table[@class='list']/tr",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
-
-    },
-    {
-        'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)],
-        'type': 'module',
-        'moduleName': 'proxy_listPraser',
-        'pattern': 'Proxy\(.+\)',
-        'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2}
-
-    },
-    {
-        'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in
-                 ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])],
-        'type': 'xpath',
-        'pattern': ".//table[@class='proxy__t']/tbody/tr",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
-
-    },
-    {
-        'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)],
-        'type': 'xpath',
-        'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
-    },
-    {
-        'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in
-                 range(1, 11)],
-        'type': 'xpath',
-        'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
-    },
-    {
-        'urls': ['http://www.cz88.net/proxy/%s' % m for m in
-                 ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]],
-        'type': 'xpath',
-        'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]",
-        'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''}
-
-    },
-    {
-        'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)],
-        'type': 'xpath',
-        'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
-        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
-
-    },
-    {
-        'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)],
-        'type': 'xpath',
-        'pattern': ".//*[@id='ip_list']/tr[position()>1]",
-        'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'}
-    },
-    {
-        'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
-        'type': 'module',
-        'moduleName': 'CnproxyPraser',
-        'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(\"\:\"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
-        'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
-    }
-]
-'''
-Database configuration
-'''
-DB_CONFIG = {
-
-    'DB_CONNECT_TYPE': 'sqlalchemy',  # 'pymongo'sqlalchemy
-    # 'DB_CONNECT_STRING':'mongodb://localhost:27017/'
-    'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db'
-    # DB_CONNECT_STRING = 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
-
-
-}
-CHINA_AREA = [u'河北', u'山东', u'辽宁', u'黑龙江', u'吉林'
-    , u'甘肃', u'青海', u'河南', u'江苏', u'湖北', u'湖南',
-              u'江西', u'浙江', u'广东', u'云南', u'福建',
-              u'台湾', u'海南', u'山西', u'四川', u'陕西',
-              u'贵州', u'安徽', u'重庆', u'北京', u'上海', u'天津', u'广西', u'内蒙', u'西藏', u'新疆', u'宁夏', u'香港', u'澳门']
-QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat"
-
-THREADNUM = 5
-API_PORT = 8000
-'''
-Settings for crawling and validating proxy IPs.
-There is no need to check whether an IP already exists, because stale entries are purged on a schedule.
-'''
-UPDATE_TIME = 60 * 60  # check once an hour for proxy IPs that have gone stale
-MINNUM = 40  # start the crawler when the number of valid IPs falls below this
-MAXTIME = 3 * 24 * 60  # maximum time an entry may sit in storage; anything older is deleted
-
-TIMEOUT = 8  # socket timeout
-
-'''
-Settings for evading anti-crawler defenses
-'''
-'''
-Number of retries
-'''
-RETRY_TIME = 3
-
-'''
-USER_AGENTS: pool of random User-Agent headers
-'''
-USER_AGENTS = [
-    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
-    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
-    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
-    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
-    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
-    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
-    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
-    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
-    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
-    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
-    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
-    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
-    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
-    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
-    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
-    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
-    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
-    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
-    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
-    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
-    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
-    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
-    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
-    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
-    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
-    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
-]
-
-HEADER = {
-    'User-Agent': random.choice(USER_AGENTS),
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-US,en;q=0.5',
-    'Connection': 'keep-alive',
-    'Accept-Encoding': 'gzip, deflate',
-}
-
-TEST_URL = 'http://httpbin.org/ip'  # 'http://ip.chinaz.com/getip.aspx'
-TEST_IP = 'http://httpbin.org/ip'
-TEST_HTTP_HEADER = 'http://httpbin.org/get'
-TEST_HTTPS_HEADER = 'https://httpbin.org/get'
-# # Detection keyword: verifies that a tested proxy can actually reach the target site
-# TEST_KEY = '站长工具'
-TEST_PROXY = 'http://www.stilllistener.com/checkpoint1/test11/'

BIN
IPProxyPool_py2/data/proxy.db


+ 0 - 49
IPProxyPool_py2/db/DataStore.py

@@ -1,49 +0,0 @@
-# coding:utf-8
-import sys
-from config import DB_CONFIG
-from util.exception import Con_DB_Fail
-
-
-try:
-    if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo':
-        from db.MongoHelper import MongoHelper as SqlHelper
-    else:
-        from db.SqlHelper import SqlHelper as SqlHelper
-    sqlhelper = SqlHelper()
-    sqlhelper.init_db()
-except Exception, e:
-    raise Con_DB_Fail
-
-
-def store_data(queue2, db_proxy_num):
-    '''
-    Read proxies from the queue and write them to the database
-    :param queue2:
-    :return:
-    '''
-    successNum = 0
-    failNum = 0
-    while True:
-        try:
-            proxy = queue2.get(timeout=300)
-            if proxy:
-
-                sqlhelper.insert(proxy)
-                successNum += 1
-            else:
-                failNum += 1
-            str = u'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
-            sys.stdout.write(str + "\r")
-            sys.stdout.flush()
-        except BaseException, e:
-
-            if db_proxy_num.value != 0:
-                successNum += db_proxy_num.value
-                db_proxy_num.value = 0
-                str = u'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
-                sys.stdout.write(str + "\r")
-                sys.stdout.flush()
-                successNum = 0
-                failNum = 0
-
-
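
The deleted module also uses the py2-only handler syntax 'except Exception, e', which is a SyntaxError on Python 3; the merged code (see spider/HtmlDownloader.py below) uses the form both interpreters accept. A minimal demonstration:

try:
    raise ValueError('demo')
except Exception as e:  # 'except ... as e' parses on Python 2.6+ and Python 3
    print(e)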

+ 0 - 31
IPProxyPool_py2/db/RedisHelper.py

@@ -1,31 +0,0 @@
-# coding:utf-8
-import pymongo
-from db.ISqlHelper import ISqlHelper
-
-
-class RedisHelper(ISqlHelper):
-    def __init__(self):
-        pass
-
-    def init_db(self):
-        pass
-
-
-    def drop_db(self):
-        pass
-
-
-    def insert(self, value):
-        pass
-
-
-    def delete(self, conditions):
-        pass
-
-    def update(self, conditions, value):
-        pass
-
-    def select(self, count=None, conditions=[]):
-        pass
-
-

+ 0 - 146
IPProxyPool_py2/db/SqlHelper.py

@@ -1,146 +0,0 @@
-# coding:utf-8
-import datetime
-from sqlalchemy import Column, Integer, String, DateTime, Numeric, create_engine, VARCHAR
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import sessionmaker
-from config import DB_CONFIG
-
-from db.ISqlHelper import ISqlHelper
-
-'''
-Base class for SQL operations.
-Fields: ip, port, types (0 elite/high-anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last updated)
- speed (connection speed)
-'''
-
-BaseModel = declarative_base()
-
-
-class Proxy(BaseModel):
-    __tablename__ = 'proxys'
-    id = Column(Integer, primary_key=True, autoincrement=True)
-    ip = Column(VARCHAR(16), nullable=False)
-    port = Column(Integer, nullable=False)
-    types = Column(Integer, nullable=False)
-    protocol = Column(Integer, nullable=False, default=0)
-    country = Column(VARCHAR(100), nullable=False)
-    area = Column(VARCHAR(100), nullable=False)
-    updatetime = Column(DateTime(), default=datetime.datetime.utcnow)
-    speed = Column(Numeric(5, 2), nullable=False)
-    score = Column(Integer, nullable=False, default=0)
-
-
-class SqlHelper(ISqlHelper):
-    params = {'ip': Proxy.ip, 'port': Proxy.port, 'types': Proxy.types, 'protocol': Proxy.protocol,
-              'country': Proxy.country, 'area': Proxy.area, 'score': Proxy.score}
-
-    def __init__(self):
-        if 'sqlite' in DB_CONFIG['DB_CONNECT_STRING']:
-            connect_args = {'check_same_thread': False}
-            self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False, connect_args=connect_args)
-        else:
-            self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False)
-        DB_Session = sessionmaker(bind=self.engine)
-        self.session = DB_Session()
-
-    def init_db(self):
-        BaseModel.metadata.create_all(self.engine)
-
-    def drop_db(self):
-        BaseModel.metadata.drop_all(self.engine)
-
-
-    def insert(self, value):
-        proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'],
-                      country=value['country'],
-                      area=value['area'], speed=value['speed'])
-        self.session.add(proxy)
-        self.session.commit()
-
-
-    def delete(self, conditions=None):
-        if conditions:
-            conditon_list = []
-            for key in conditions.keys():
-                if self.params.get(key, None):
-                    conditon_list.append(self.params.get(key) == conditions.get(key))
-            conditions = conditon_list
-            query = self.session.query(Proxy)
-            for condition in conditions:
-                query = query.filter(condition)
-            deleteNum = query.delete()
-            self.session.commit()
-        else:
-            deleteNum = 0
-        return ('deleteNum', deleteNum)
-
-
-    def update(self, conditions=None, value=None):
-        '''
-        conditions is a dict shaped like self.params
-        :param conditions:
-        :param value: also a dict, e.g. {'ip': '192.168.0.1'}
-        :return:
-        '''
-        if conditions and value:
-            conditon_list = []
-            for key in conditions.keys():
-                if self.params.get(key, None):
-                    conditon_list.append(self.params.get(key) == conditions.get(key))
-            conditions = conditon_list
-            query = self.session.query(Proxy)
-            for condition in conditions:
-                query = query.filter(condition)
-            updatevalue = {}
-            for key in value.keys():
-                if self.params.get(key, None):
-                    updatevalue[self.params.get(key, None)] = value.get(key)
-            updateNum = query.update(updatevalue)
-            self.session.commit()
-        else:
-            updateNum = 0
-        return {'updateNum': updateNum}
-
-
-    def select(self, count=None, conditions=None):
-        '''
-        conditions is a dict shaped like self.params
-        :param count:
-        :param conditions:
-        :return:
-        '''
-        if conditions:
-            conditon_list = []
-            for key in conditions.keys():
-                if self.params.get(key, None):
-                    conditon_list.append(self.params.get(key) == conditions.get(key))
-            conditions = conditon_list
-        else:
-            conditions = []
-
-        query = self.session.query(Proxy.ip, Proxy.port, Proxy.score)
-        if len(conditions) > 0 and count:
-            for condition in conditions:
-                query = query.filter(condition)
-            return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all()
-        elif count:
-            return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all()
-        elif len(conditions) > 0:
-            for condition in conditions:
-                query = query.filter(condition)
-            return query.order_by(Proxy.score.desc(), Proxy.speed).all()
-        else:
-            return query.order_by(Proxy.score.desc(), Proxy.speed).all()
-
-
-    def close(self):
-        pass
-
-
-if __name__ == '__main__':
-    sqlhelper = SqlHelper()
-    sqlhelper.init_db()
-    proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': u'中国', 'area': u'广州',
-             'speed': 11.123}
-    sqlhelper.insert(proxy)
-
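
For reference, a hedged usage sketch of the SqlHelper API shown above (sqlite backend from DB_CONFIG): select() returns (ip, port, score) tuples ordered by score descending, then speed ascending. Note that the deleted __main__ passes the key 'type' while insert() reads value['types'], so it would raise KeyError; the sketch uses the key insert() actually expects.

from db.SqlHelper import SqlHelper

helper = SqlHelper()
helper.init_db()
helper.insert({'ip': '192.168.1.1', 'port': 80, 'types': 0, 'protocol': 0,
               'country': u'中国', 'area': u'广州', 'speed': 11.123})
for ip, port, score in helper.select(count=5, conditions={'types': 0}):
    print('%s:%s score=%s' % (ip, port, score))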

+ 0 - 73
IPProxyPool_py2/spider/HtmlDownLoader.py

@@ -1,73 +0,0 @@
-# coding:utf-8
-
-import random
-import config
-import json
-from db.DataStore import sqlhelper
-
-__author__ = 'Xaxdus'
-
-import requests
-import chardet
-
-
-class Html_Downloader(object):
-    @classmethod
-    def download(self, url):
-        count = 0  # retry count
-        r = ''
-        try:
-            r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT)
-            r.encoding = chardet.detect(r.content)['encoding']
-            while count < config.RETRY_TIME:
-                if (not r.ok) or len(r.content) < 500:
-                    proxylist = sqlhelper.select(10)
-                    proxy = random.choice(proxylist)
-                    ip = proxy[0]
-                    port = proxy[1]
-                    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-                    try:
-                        r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
-                        r.encoding = chardet.detect(r.content)['encoding']
-                        count += 1
-                    except Exception, e:
-                        count += 1
-
-
-                else:
-                    return r.text
-
-            return None
-
-        except Exception, e:
-            while count < config.RETRY_TIME:
-                if r == '' or (not r.ok) or len(r.content) < 500:
-                    try:
-                        proxylist = sqlhelper.select(10)
-                        proxy = random.choice(proxylist)
-                        ip = proxy[0]
-                        port = proxy[1]
-                        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-                        try:
-                            r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
-                            r.encoding = chardet.detect(r.content)['encoding']
-                            count += 1
-                        except Exception, e:
-                            count += 1
-
-                    except Exception, e:
-                        return None
-
-                else:
-                    return r.text
-
-            return None
-
-
-
-
-
-
-
-
-

+ 0 - 168
IPProxyPool_py2/spider/HtmlPraser.py

@@ -1,168 +0,0 @@
-# coding:utf-8
-import base64
-import datetime
-from config import QQWRY_PATH, CHINA_AREA
-
-from util.IPAddress import IPAddresss
-import re
-
-__author__ = 'qiye'
-from lxml import etree
-
-
-class Html_Parser(object):
-    def __init__(self):
-        self.ips = IPAddresss(QQWRY_PATH)
-
-    def parse(self, response, parser):
-        '''
-
-        :param response: the HTTP response body
-        :param type: parsing method
-        :return:
-        '''
-        if parser['type'] == 'xpath':
-            return self.XpathPraser(response, parser)
-        elif parser['type'] == 'regular':
-            return self.RegularPraser(response, parser)
-        elif parser['type'] == 'module':
-            return getattr(self, parser['moduleName'], None)(response, parser)
-        else:
-            return None
-
-    def AuthCountry(self, addr):
-        '''
-        Determine which country the address belongs to
-        :param addr:
-        :return:
-        '''
-        for area in CHINA_AREA:
-            if addr.find(area) != -1:
-                return True
-        return False
-
-
-    def XpathPraser(self, response, parser):
-        '''
-        Parse the response with XPath
-        :param response:
-        :param parser:
-        :return:
-        '''
-        # print response
-        proxylist = []
-        root = etree.HTML(response)
-        proxys = root.xpath(parser['pattern'])
-        for proxy in proxys:
-
-            try:
-                ip = proxy.xpath(parser['position']['ip'])[0].text
-                port = proxy.xpath(parser['position']['port'])[0].text
-                type = 0
-                protocol = 0
-                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                country = ''
-                area = ''
-                if addr.find(u'省') != -1 or self.AuthCountry(addr):
-                    country = u'国内'
-                    area = addr
-                else:
-                    country = u'国外'
-                    area = addr
-            except Exception, e:
-                continue
-            # updatetime = datetime.datetime.now()
-            # ip, port, type (0 elite, 1 transparent), protocol (0 http, 1 https http), country, area (province/city), updatetime (last updated)
-            # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
-            proxy = {'ip': ip, 'port': int(port), 'types': int(type), 'protocol': int(protocol), 'country': country,
-                     'area': area, 'speed': 100}
-            proxylist.append(proxy)
-        return proxylist
-
-    def RegularPraser(self, response, parser):
-        '''
-        Parse the response with regular expressions
-        :param response:
-        :param parser:
-        :return:
-        '''
-        proxylist = []
-        pattern = re.compile(parser['pattern'])
-        matchs = pattern.findall(response)
-        if matchs != None:
-            for match in matchs:
-                ip = match[parser['position']['ip']]
-                port = match[parser['position']['port']]
-                # the type listed on the site is never reliable, so default it; the validator checks it later
-                type = 0
-                # if parser['postion']['protocol'] > 0:
-                # protocol = match[parser['postion']['protocol']]
-                #     if protocol.lower().find('https')!=-1:
-                #         protocol = 1
-                #     else:
-                #         protocol = 0
-                # else:
-                protocol = 0
-                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                country = ''
-                area = ''
-                if addr.find(u'省') != -1 or self.AuthCountry(addr):
-                    country = u'中国'
-                    area = addr
-                else:
-                    country = addr
-                    area = ''
-                proxy = {'ip': ip, 'port': port, 'types': type, 'protocol': protocol, 'country': country, 'area': area,
-                         'speed': 100}
-
-                proxylist.append(proxy)
-            return proxylist
-
-
-    def CnproxyPraser(self, response, parser):
-        proxylist = self.RegularPraser(response, parser)
-        chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
-
-        for proxy in proxylist:
-            port = proxy['port']
-            new_port = ''
-            for i in range(len(port)):
-                if port[i] != '+':
-                    new_port += chardict[port[i]]
-            new_port = int(new_port)
-            proxy['port'] = new_port
-        return proxylist
-
-
-    def proxy_listPraser(self, response, parser):
-        proxylist = []
-        pattern = re.compile(parser['pattern'])
-        matchs = pattern.findall(response)
-        if matchs:
-            for match in matchs:
-                ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", ""))
-                ip = ip_port.split(':')[0]
-                port = ip_port.split(':')[1]
-                type = 0
-                protocol = 0
-                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                country = ''
-                area = ''
-                if addr.find(u'省') != -1 or self.AuthCountry(addr):
-                    country = u'中国'
-                    area = addr
-                else:
-                    country = addr
-                    area = ''
-                proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country,
-                         'area': area, 'speed': 100}
-
-                proxylist.append(proxy)
-            return proxylist
-
-
-
-
-
-
-

+ 0 - 97
IPProxyPool_py2/spider/ProxyCrawl.py

@@ -1,97 +0,0 @@
-# coding:utf-8
-import gevent
-from gevent.pool import Pool
-from multiprocessing import Queue, Process, Value
-import time
-import sys
-from api.apiServer import start_api_server
-from config import THREADNUM, parserList, UPDATE_TIME, MINNUM
-import config
-from db.DataStore import store_data, sqlhelper
-from spider.HtmlDownLoader import Html_Downloader
-from spider.HtmlPraser import Html_Parser
-from validator.Validator import validator, getMyIP, detect_from_db
-
-
-__author__ = 'qiye'
-from gevent import monkey
-
-monkey.patch_all()
-'''
-This class implements the crawler logic
-'''
-
-
-def startProxyCrawl(queue, db_proxy_num):
-    crawl = ProxyCrawl(queue, db_proxy_num)
-    crawl.run()
-
-
-class ProxyCrawl(object):
-    proxies = set()
-
-    def __init__(self, queue, db_proxy_num):
-        self.crawl_pool = Pool(THREADNUM)
-        self.queue = queue
-        self.db_proxy_num = db_proxy_num
-
-
-    def run(self):
-        while True:
-            self.proxies.clear()
-            str = u'IPProxyPool----->>>>>>>>beginning'
-            sys.stdout.write(str + "\r\n")
-            sys.stdout.flush()
-            proxylist = sqlhelper.select()
-            myip = getMyIP()
-            spawns = []
-            for proxy in proxylist:
-                spawns.append(gevent.spawn(detect_from_db, myip, proxy, self.proxies))
-            gevent.joinall(spawns)
-            self.db_proxy_num.value = len(self.proxies)
-            str = u'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
-
-            if len(self.proxies) < MINNUM:
-                str += u'\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
-                sys.stdout.write(str + "\r\n")
-                sys.stdout.flush()
-                self.crawl_pool.map(self.crawl, parserList)
-            else:
-                str += u'\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
-                sys.stdout.write(str + "\r\n")
-                sys.stdout.flush()
-
-            time.sleep(UPDATE_TIME)
-
-
-    def crawl(self, parser):
-        html_parser = Html_Parser()
-        for url in parser['urls']:
-            response = Html_Downloader.download(url)
-            if response != None:
-                proxylist = html_parser.parse(response, parser)
-                if proxylist != None:
-                    for proxy in proxylist:
-                        proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
-                        if proxy_str not in self.proxies:
-                            self.proxies.add(proxy_str)
-                            self.queue.put(proxy)
-
-
-if __name__ == "__main__":
-    DB_PROXY_NUM = Value('i', 0)
-    q1 = Queue()
-    q2 = Queue()
-    p0 = Process(target=start_api_server)
-    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM))
-    p2 = Process(target=validator, args=(q1, q2))
-    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
-
-    p0.start()
-    p1.start()
-    p2.start()
-    p3.start()
-
-
-    # spider = ProxyCrawl()
-    # spider.run()
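
The crawler's concurrency pattern in a minimal standalone sketch (assumes gevent is installed): monkey.patch_all() makes blocking sockets cooperative, and a Pool of THREADNUM greenlets bounds how many parserList sites are crawled at once, while separate processes handle validation and storage via the two queues.

from gevent import monkey
monkey.patch_all()

from gevent.pool import Pool

def crawl(url):
    print('would download and parse %s here' % url)  # placeholder for the real crawl()

pool = Pool(5)  # THREADNUM = 5 in config.py
pool.map(crawl, ['http://www.66ip.cn/%s.html' % n for n in range(2, 8)])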

+ 0 - 16
IPProxyPool_py2/test/test.py

@@ -1,16 +0,0 @@
-# coding:utf-8
-import requests
-import json
-
-r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国')
-ip_ports = json.loads(r.text)
-print ip_ports
-ip = ip_ports[0]['ip']
-port = ip_ports[0]['port']
-proxies = {
-    'http': 'http://%s:%s' % (ip, port),
-    'https': 'http://%s:%s' % (ip, port)
-}
-r = requests.get('http://ip.chinaz.com/', proxies=proxies)
-r.encoding = 'utf-8'
-print r.text

+ 0 - 8
IPProxyPool_py2/test/testHttpbin.py

@@ -1,8 +0,0 @@
-# coding:utf-8
-import json
-import requests
-import config
-
-r = requests.get(url=config.TEST_IP, headers=config.HEADER, timeout=config.TIMEOUT)
-json = json.loads(r.text)
-print json['origin']

+ 0 - 44
IPProxyPool_py2/test/testIPType.py

@@ -1,44 +0,0 @@
-# coding:utf-8
-from lxml import etree
-import requests
-import config
-
-
-def checkProxyType(selfip, proxies):
-    '''
-    Detect the proxy's anonymity type; the data published by free proxy sites is unreliable, so check it ourselves
-    :param proxies: proxy (0 elite, 1 anonymous, 2 transparent, 3 invalid)
-    :return:
-    '''
-
-    try:
-        r = requests.get(url='https://incloak.com/ip/', headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
-        print r.text
-        # if r.ok:
-        # root = etree.HTML(r.text)
-        # ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
-        #     http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
-        #     http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
-        #     # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
-        #     if ip==selfip:
-        #         return 3
-        #     if http_x_forwared_for is None and http_via is None:
-        #         return 0
-        #     if http_via != None and http_x_forwared_for.find(selfip)== -1:
-        #         return 1
-        #
-        #     if http_via != None and http_x_forwared_for.find(selfip)!= -1:
-        #         return 2
-        # return 3
-
-
-    except Exception, e:
-        print str(e)
-        return 3
-
-
-if __name__ == '__main__':
-    ip = '61.132.241.109'
-    port = '808'
-    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-    checkProxyType(None, proxies)

+ 0 - 11
IPProxyPool_py2/test/testbase64.py

@@ -1,11 +0,0 @@
-# coding:utf-8
-import base64
-import re
-
-str = '''
-<script type="text/javascript">Proxy('NzcuODcuMjEuODY6ODA4MA==')</script></li>
-'''
-match = re.search('Proxy\(.+\)', str)
-print match.group()
-ip_port = base64.b64decode(match.group().replace("Proxy('", "").replace("')", ""))
-print ip_port

+ 0 - 45
IPProxyPool_py2/test/testhttpserver.py

@@ -1,45 +0,0 @@
-# coding:utf-8
-import BaseHTTPServer
-import json
-import urlparse
-
-
-class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
-    def do_GET(self):
-        """
-        """
-        print self.path
-        parsed_path = urlparse.urlparse(self.path)
-        print parsed_path
-        print parsed_path.query
-        # message_parts = [
-        # 'CLIENT VALUES:',
-        # 'client_address=%s (%s)' % (self.client_address,
-        #                                     self.address_string()),
-        #         'command=%s' % self.command,
-        #         'path=%s' % self.path,
-        #         'real path=%s' % parsed_path.path,
-        #         'query=%s' % parsed_path.query,
-        #         'request_version=%s' % self.request_version,
-        #         '',
-        #         'SERVER VALUES:',
-        #         'server_version=%s' % self.server_version,
-        #         'sys_version=%s' % self.sys_version,
-        #         'protocol_version=%s' % self.protocol_version,
-        #         '',
-        #         'HEADERS RECEIVED:',
-        #         ]
-        # for name, value in sorted(self.headers.items()):
-        #     message_parts.append('%s=%s' % (name, value.rstrip()))
-        # message_parts.append('')
-        # message = '\r\n'.join(message_parts)
-        data1 = [{'ip': '192.168.0.0', 'port': 456}] * 10
-        d1 = json.dumps(data1, sort_keys=True, indent=4)
-        message = ('192.168.1.1', 80)
-        self.send_response(200)
-        self.end_headers()
-        self.wfile.write(d1)
-
-
-server = BaseHTTPServer.HTTPServer(('0.0.0.0', 8000), WebRequestHandler)
-server.serve_forever()

+ 0 - 40
IPProxyPool_py2/test/testlist.py

@@ -1,40 +0,0 @@
-# coding:utf-8
-from decimal import Decimal
-
-__author__ = 'Xaxdus'
-
-
-# list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)]
-#
-# list = [(1,10)]*10
-#
-# for m,n in list:
-# print m,n
-#
-#
-# list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list]
-# print list2
-
-# x=Decimal('0.998531571219').quantize(Decimal('0.00'))
-# a= 0.998531571219
-# value = round(a, 3)
-# print x,type(x),value
-# proxys=[]
-# proxy=[123,1234]
-# proxys.append(proxy)
-#
-# proxy=[123,1234]
-# proxys.append(proxy)
-#
-# print proxys
-# l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}]
-#
-# # for d in l:
-# #    print  [tuple(d.items())]
-# print [tuple(d.items()) for d in l]
-#
-# print [dict(t) for t in set([tuple(d.items()) for d in l])]
-import requests
-
-r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127')
-print r.text

File diff suppressed because it is too large
+ 0 - 127
IPProxyPool_py2/test/testlxml.py


+ 0 - 9
IPProxyPool_py2/test/testqueue.py

@@ -1,9 +0,0 @@
-# coding:utf-8
-from multiprocessing import Queue
-
-try:
-    q = Queue()
-    q.get(timeout=5)
-except BaseException, e:
-    print '--' + str(e)
-

+ 0 - 147
IPProxyPool_py2/util/IPAddress.py

@@ -1,147 +0,0 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
-
-import socket
-import struct
-
-import logging
-
-logger = logging.getLogger('util')
-
-
-class IPAddresss:
-    def __init__(self, ipdbFile):
-        self.ipdb = open(ipdbFile, "rb")
-        str = self.ipdb.read(8)
-        (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
-        self.indexCount = (self.lastIndex - self.firstIndex) / 7 + 1
-        # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount)
-
-    def getVersion(self):
-        s = self.getIpAddr(0xffffff00L)
-        return s
-
-    def getAreaAddr(self, offset=0):
-        if offset:
-            self.ipdb.seek(offset)
-        str = self.ipdb.read(1)
-        (byte,) = struct.unpack('B', str)
-        if byte == 0x01 or byte == 0x02:
-            p = self.getLong3()
-            if p:
-                return self.getString(p)
-            else:
-                return ""
-        else:
-            self.ipdb.seek(-1, 1)
-            return self.getString(offset)
-
-    def getAddr(self, offset, ip=0):
-        self.ipdb.seek(offset + 4)
-        countryAddr = ""
-        areaAddr = ""
-        str = self.ipdb.read(1)
-        (byte,) = struct.unpack('B', str)
-        if byte == 0x01:
-            countryOffset = self.getLong3()
-            self.ipdb.seek(countryOffset)
-            str = self.ipdb.read(1)
-            (b,) = struct.unpack('B', str)
-            if b == 0x02:
-                countryAddr = self.getString(self.getLong3())
-                self.ipdb.seek(countryOffset + 4)
-            else:
-                countryAddr = self.getString(countryOffset)
-            areaAddr = self.getAreaAddr()
-        elif byte == 0x02:
-            countryAddr = self.getString(self.getLong3())
-            areaAddr = self.getAreaAddr(offset + 8)
-        else:
-            countryAddr = self.getString(offset + 4)
-            areaAddr = self.getAreaAddr()
-        return countryAddr + " " + areaAddr
-
-    def dump(self, first, last):
-        if last > self.indexCount:
-            last = self.indexCount
-        for index in range(first, last):
-            offset = self.firstIndex + index * 7
-            self.ipdb.seek(offset)
-            buf = self.ipdb.read(7)
-            (ip, of1, of2) = struct.unpack("IHB", buf)
-            address = self.getAddr(of1 + (of2 << 16))
-            # convert GBK to UTF-8
-            address = unicode(address, 'gbk').encode("utf-8")
-            logger.info("%d %s %s" % (index, self.ip2str(ip), address))
-
-    def setIpRange(self, index):
-        offset = self.firstIndex + index * 7
-        self.ipdb.seek(offset)
-        buf = self.ipdb.read(7)
-        (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
-        self.curEndIpOffset = of1 + (of2 << 16)
-        self.ipdb.seek(self.curEndIpOffset)
-        buf = self.ipdb.read(4)
-        (self.curEndIp,) = struct.unpack("I", buf)
-
-    def getIpAddr(self, ip):
-        L = 0
-        R = self.indexCount - 1
-        while L < R - 1:
-            M = (L + R) / 2
-            self.setIpRange(M)
-            if ip == self.curStartIp:
-                L = M
-                break
-            if ip > self.curStartIp:
-                L = M
-            else:
-                R = M
-        self.setIpRange(L)
-        # version information, 255.255.255.X, urgy but useful
-        if ip & 0xffffff00L == 0xffffff00L:
-            self.setIpRange(R)
-        if self.curStartIp <= ip <= self.curEndIp:
-            address = self.getAddr(self.curEndIpOffset)
-            # convert GBK to UTF-8
-            address = unicode(address, 'gbk')
-        else:
-            address = u"未找到该IP的地址"
-        return address
-
-    def getIpRange(self, ip):
-        self.getIpAddr(ip)
-        range = self.ip2str(self.curStartIp) + ' - ' \
-                + self.ip2str(self.curEndIp)
-        return range
-
-    def getString(self, offset=0):
-        if offset:
-            self.ipdb.seek(offset)
-        str = ""
-        ch = self.ipdb.read(1)
-        (byte,) = struct.unpack('B', ch)
-        while byte != 0:
-            str += ch
-            ch = self.ipdb.read(1)
-            (byte,) = struct.unpack('B', ch)
-        return str
-
-    def ip2str(self, ip):
-        return str(ip >> 24) + '.' + str((ip >> 16) & 0xffL) + '.' + str((ip >> 8) & 0xffL) + '.' + str(ip & 0xffL)
-
-    def str2ip(self, s):
-        (ip,) = struct.unpack('I', socket.inet_aton(s))
-        return ((ip >> 24) & 0xffL) | ((ip & 0xffL) << 24) | ((ip >> 8) & 0xff00L) | ((ip & 0xff00L) << 8)
-
-    def getLong3(self, offset=0):
-        if offset:
-            self.ipdb.seek(offset)
-        str = self.ipdb.read(3)
-        (a, b) = struct.unpack('HB', str)
-        return (b << 16) + a
-
-
-
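
The qqwry.dat reader above is py2-only in a few small ways that the merged util/IPAddress.py (+7 -6 in this commit) has to address: '/' on two ints yields a float on Python 3, the 0xffL long-literal suffix is a SyntaxError, and unicode() is gone. Hedged one-line illustrations (the header values here are made up):

index_count = (715 - 8) // 7 + 1  # py2 used '/'; py3 needs '//' to keep int math
VERSION_SENTINEL = 0xffffff00     # py2 wrote 0xffffff00L; the 'L' suffix must go
mid = (0 + index_count) // 2      # binary-search midpoint must stay integral
print(index_count, hex(VERSION_SENTINEL), mid)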

+ 0 - 11
IPProxyPool_py2/util/logger.py

@@ -1,11 +0,0 @@
-# coding:utf-8
-import logging
-
-__author__ = 'Xaxdus'
-
-logger = logging.getLogger()
-
-
-def logger_proxy(proxy):
-    logger.setLevel(logging.INFO)
-    logger.info(proxy)

+ 0 - 170
IPProxyPool_py2/validator/Validator.py

@@ -1,170 +0,0 @@
-# coding:utf-8
-import json
-from multiprocessing import Process
-
-import gevent
-
-import requests
-import time
-import config
-from db.DataStore import sqlhelper
-from util.exception import Test_URL_Fail
-
-from gevent import monkey
-
-monkey.patch_all()
-
-
-def detect_from_db(myip, proxy, proxies_set):
-    proxy_dict = {'ip': proxy[0], 'port': proxy[1]}
-    result = detect_proxy(myip, proxy_dict)
-    if result:
-        if proxy[2] < 60000:
-            score = proxy[2] + 1
-        else:
-            score = 60000
-        proxy_str = '%s:%s' % (proxy[0], proxy[1])
-        proxies_set.add(proxy_str)
-        sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
-    else:
-        sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
-
-    pass
-
-
-def validator(queue1, queue2, myip):
-    tasklist = []
-    while True:
-        try:
-            # proxy_dict = {'source':'crawl','data':proxy}
-            proxy = queue1.get(timeout=10)
-            tasklist.append(proxy)
-            if len(tasklist) > 500:
-                p = Process(target=process_start, args=(tasklist, myip, queue2))
-                p.start()
-                tasklist = []
-        except Exception, e:
-            if len(tasklist) > 0:
-                p = Process(target=process_start, args=(tasklist, myip, queue2))
-                p.start()
-                tasklist = []
-
-
-def process_start(tasks, myip, queue2):
-    spawns = []
-    for task in tasks:
-        spawns.append(gevent.spawn(detect_proxy, myip, task, queue2))
-    gevent.joinall(spawns)
-
-
-def detect_proxy(selfip, proxy, queue2=None):
-    '''
-    :param proxy: dict describing the proxy IP
-    :return:
-    '''
-    ip = proxy['ip']
-    port = proxy['port']
-    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-    protocol, types, speed = checkProxy(selfip, proxies)
-    if protocol > 0:
-        proxy['protocol'] = protocol
-        proxy['type'] = types
-        proxy['speed'] = speed
-    else:
-        proxy = None
-    if queue2:
-        queue2.put(proxy)
-    return proxy
-
-
-def checkProxy(selfip, proxies):
-    '''
-    Detect the proxy's anonymity type; the data published by free proxy sites is unreliable, so check it ourselves
-    :param
-    :return:
-    '''
-    protocol = -1
-    types = -1
-    speed = -1
-    http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
-    https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
-    if http and https:
-        protocol = 2
-        types = http_types
-        speed = http_speed
-    elif http:
-        types = http_types
-        protocol = 0
-        speed = http_speed
-    elif https:
-        types = https_types
-        protocol = 1
-        speed = https_speed
-    else:
-        types = -1
-        protocol = -1
-        speed = -1
-    return protocol, types, speed
-
-
-def _checkHttpProxy(selfip, proxies, isHttp=True):
-    types = -1
-    speed = -1
-    if isHttp:
-        test_url = config.TEST_HTTP_HEADER
-    else:
-        test_url = config.TEST_HTTPS_HEADER
-    try:
-        start = time.time()
-        r = requests.get(url=test_url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
-        if r.ok:
-            speed = round(time.time() - start, 2)
-            content = json.loads(r.text)
-            headers = content[u'headers']
-            ip = content[u'origin']
-            x_forwarded_for = headers.get(u'X-Forwarded-For', None)
-            x_real_ip = headers.get(u'X-Real-Ip', None)
-            if selfip in ip or ',' in ip:
-                return False, types, speed
-            elif x_forwarded_for is None and x_real_ip is None:
-                types = 0
-            elif selfip not in x_forwarded_for and selfip not in x_real_ip:
-                types = 1
-            else:
-                types = 2
-            return True, types, speed
-        else:
-            return False, types, speed
-
-    except Exception, e:
-        return False, types, speed
-
-
-def getMyIP():
-    try:
-        r = requests.get(url=config.TEST_IP, headers=config.HEADER, timeout=config.TIMEOUT)
-        ip = json.loads(r.text)
-        return ip['origin']
-    except Exception, e:
-        raise Test_URL_Fail
-
-
-if __name__ == '__main__':
-    myip = getMyIP()
-    ip,port = "61.152.81.193",9100
-    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-    protocol, types, speed = checkProxy(myip, proxies)
-
-    # ip = '124.88.67.81'
-    # port = '80'
-    # proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-    # r = requests.get(url=config.TEST_HTTP_HEADER, headers=config.HEADER, timeout=config.TIMEOUT,proxies=proxies)
-    # json = json.loads(r.text)
-    # print json['headers']
-    # print json['origin']
-    # getMyIP()
-
-
-    # j = json.dumps(str)
-    # str = j['ip']
-    # print str
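
A standalone sketch of the anonymity check implemented above: httpbin.org/get echoes back the headers it received, so comparing X-Forwarded-For / X-Real-Ip against your own IP classifies the proxy (0 elite, 1 anonymous, 2 transparent, matching checkProxy()). The sample proxy comes from the deleted __main__ and is likely long dead:

import json
import requests

my_ip = json.loads(requests.get('http://httpbin.org/ip', timeout=8).text)['origin']
proxies = {'http': 'http://61.152.81.193:9100'}  # sample proxy from the old __main__
echoed = json.loads(requests.get('http://httpbin.org/get',
                                 proxies=proxies, timeout=8).text)['headers']
xff = echoed.get('X-Forwarded-For')
real_ip = echoed.get('X-Real-Ip')
if xff is None and real_ip is None:
    print('type 0: elite (no forwarding headers)')
elif my_ip not in (xff or '') and my_ip not in (real_ip or ''):
    print('type 1: anonymous (headers present, real IP hidden)')
else:
    print('type 2: transparent (real IP leaked)')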

+ 0 - 33
IPProxyPool_py3/IPProxy.py

@@ -1,33 +0,0 @@
-# coding:utf-8
-from multiprocessing import Value, Queue, Process
-from api.apiServer import start_api_server
-import sys
-from db.DataStore import store_data
-from spider.ProxyCrawl import startProxyCrawl
-from validator.Validator import validator, getMyIP
-# import imp
-# imp.reload(sys)
-#sys.setdefaultencoding('utf8')
-
-
-
-
-if __name__ == "__main__":
-    myip = getMyIP()
-    DB_PROXY_NUM = Value('i', 0)
-    q1 = Queue()
-    q2 = Queue()
-    p0 = Process(target=start_api_server)
-    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM))
-    p2 = Process(target=validator, args=(q1, q2, myip))
-    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
-    p0.start()
-    p1.start()
-    p2.start()
-    p3.start()
-
-
-
-
-
-

+ 0 - 1
IPProxyPool_py3/api/__init__.py

@@ -1 +0,0 @@
-__author__ = 'Xaxdus'

+ 0 - 43
IPProxyPool_py3/api/apiServer.py

@@ -1,43 +0,0 @@
-# coding:utf-8
-'''
-Defines the query keywords: count, type, protocol, country, area
-'''
-import json
-import sys
-import web
-import config
-from db.DataStore import sqlhelper
-from db.SqlHelper import Proxy
-
-urls = (
-    '/', 'select',
-    '/delete', 'delete'
-)
-
-
-def start_api_server():
-    sys.argv.append('0.0.0.0:%s' % config.API_PORT)
-    app = web.application(urls, globals())
-    app.run()
-
-
-class select(object):
-    def GET(self):
-        inputs = web.input()
-        json_result = json.dumps(sqlhelper.select(inputs.get('count', None), inputs))
-        return json_result
-
-
-class delete(object):
-    params = {}
-
-    def GET(self):
-        inputs = web.input()
-        json_result = json.dumps(sqlhelper.delete(inputs))
-        return json_result
-
-
-if __name__ == '__main__':
-    sys.argv.append('0.0.0.0:8000')
-    app = web.application(urls, globals())
-    app.run()
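
A client-side sketch of the web.py API above, mirroring the repo's own test/test.py and test/testlist.py: GET / selects proxies by the keyword filters, GET /delete removes one.

import requests

print(requests.get('http://127.0.0.1:8000/?types=0&count=5').text)
print(requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127').text)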

BIN
IPProxyPool_py3/data/qqwry.dat


+ 0 - 22
IPProxyPool_py3/db/ISqlHelper.py

@@ -1,22 +0,0 @@
-# coding:utf-8
-
-class ISqlHelper(object):
-    params = {'ip': None, 'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None}
-
-    def init_db(self):
-        raise NotImplemented
-
-    def drop_db(self):
-        raise NotImplemented
-
-    def insert(self, value=None):
-        raise NotImplemented
-
-    def delete(self, conditions=None):
-        raise NotImplemented
-
-    def update(self, conditions=None, value=None):
-        raise NotImplemented
-
-    def select(self, count=None, conditions=None):
-        raise NotImplemented
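
One nit worth noting for the merged db/ISqlHelper.py: 'raise NotImplemented' fails with a TypeError at raise time on both interpreters, because NotImplemented is a sentinel value rather than an exception. The conventional abstract-method stub, as a sketch:

class ISqlHelper(object):
    # NotImplementedError is the exception class meant for
    # interface methods that subclasses must override.
    def init_db(self):
        raise NotImplementedError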

+ 0 - 58
IPProxyPool_py3/db/MongoHelper.py

@@ -1,58 +0,0 @@
-import pymongo
-from config import DB_CONFIG
-
-from db.ISqlHelper import ISqlHelper
-
-
-class MongoHelper(ISqlHelper):
-    def __init__(self):
-        self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'])
-
-    def init_db(self):
-        self.db = self.client.proxy
-        self.proxys = self.db.proxys
-
-
-    def drop_db(self):
-        self.client.drop_database(self.db)
-
-
-    def insert(self, value=None):
-        if value:
-            proxy = dict(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'],
-                         country=value['country'],
-                         area=value['area'], speed=value['speed'], score=0)
-            self.proxys.insert(proxy)
-
-
-    def delete(self, conditions=None):
-        if conditions:
-            self.proxys.remove(conditions)
-            return ('deleteNum', 'ok')
-        else:
-            return ('deleteNum', 'None')
-
-
-    def update(self, conditions=None, value=None):
-        # update({"UserName":"libing"},{"$set":{"Email":"libing@126.com","Password":"123"}})
-        if conditions and value:
-            self.proxys.update(conditions, {"$set": value})
-            return {'updateNum': 'ok'}
-        else:
-            return {'updateNum': 'fail'}
-
-    def select(self, count=None, conditions=None):
-        if count:
-            count = int(count)
-        else:
-            count = 0
-        items = self.proxys.find(filter=conditions, limit=count).sort(
-            [("speed", pymongo.ASCENDING), ("score", pymongo.DESCENDING)])
-        results = []
-        for item in items:
-            result = (item['ip'], item['port'], item['score'])
-            results.append(result)
-        return results
-
-
-

+ 0 - 1
IPProxyPool_py3/db/__init__.py

@@ -1 +0,0 @@
-__author__ = 'Xaxdus'

+ 0 - 1
IPProxyPool_py3/spider/__init__.py

@@ -1 +0,0 @@
-__author__ = 'Xaxdus'

+ 0 - 1
IPProxyPool_py3/start.bat

@@ -1 +0,0 @@
-python IPProxy.py

+ 0 - 1
IPProxyPool_py3/test/__init__.py

@@ -1 +0,0 @@
-__author__ = 'Xaxdus'

+ 0 - 12
IPProxyPool_py3/test/testsql.py

@@ -1,12 +0,0 @@
-# coding:utf-8
-from db.SqlHelper import SqlHelper
-from util.exception import Con_DB_Fail
-
-try:
-    sqlhelper = SqlHelper()
-    sqlhelper.init_db()
-except Exception, e:
-    raise Con_DB_Fail
-
-proxy = {'ip': '192.168.1.1', 'port': int('80'), 'type': 0, 'protocol': 0, 'country': u'中国', 'area': u'四川', 'speed': 0}
-sqlhelper.insert(proxy)

+ 0 - 2
IPProxyPool_py3/util/__init__.py

@@ -1,2 +0,0 @@
-__author__ = 'Xaxdus'
-

+ 0 - 14
IPProxyPool_py3/util/exception.py

@@ -1,14 +0,0 @@
-# coding:utf-8
-import config
-
-
-class Test_URL_Fail(Exception):
-    def __str__(self):
-        str = "访问%s失败,请检查网络连接" % config.TEST_URL
-        return str
-
-
-class Con_DB_Fail(Exception):
-    def __str__(self):
-        str = "使用DB_CONNECT_STRING:%s--连接数据库失败" % config.DB_CONNECT_STRING
-        return str

+ 0 - 1
IPProxyPool_py3/validator/__init__.py

@@ -1 +0,0 @@
-__author__ = 'Xaxdus'

+ 9 - 1
README.md

@@ -201,11 +201,19 @@ print r.text
 ```
 
 ## TODO
-1. Optionally add a squid reverse-proxy server to simplify crawler configuration
+1. Add second-tier proxy support to simplify crawler configuration
 <br/>
 
 
## Update history
+
+
+-----------------------------2017-1-16----------------------------
+<br/>
+1. Merged the py2 and py3 versions into a single compatible codebase
+<br/>
+2. Fixed a pymongo query bug
+<br/>
 -----------------------------2017-1-11----------------------------
 <br/>
1. Use httpbin.org to check the anonymity level of proxy IPs

+ 0 - 0
IPProxyPool_py2/api/__init__.py → api/__init__.py


+ 0 - 0
IPProxyPool_py2/api/apiServer.py → api/apiServer.py


+ 0 - 0
IPProxyPool_py3/config.py → config.py


+ 0 - 0
IPProxyPool_py2/data/qqwry.dat → data/qqwry.dat


+ 0 - 0
IPProxyPool_py3/db/DataStore.py → db/DataStore.py


+ 0 - 0
IPProxyPool_py2/db/ISqlHelper.py → db/ISqlHelper.py


+ 20 - 2
IPProxyPool_py2/db/MongoHelper.py → db/MongoHelper.py

@@ -46,7 +46,16 @@ class MongoHelper(ISqlHelper):
             count = int(count)
         else:
             count = 0
-        items = self.proxys.find(filter=conditions, limit=count).sort(
+        if conditions:
+            conditions = dict(conditions)
+            conditions_name = ['types', 'protocol']
+            for condition_name in conditions_name:
+                value = conditions.get(condition_name, None)
+                if value:
+                    conditions[condition_name] = int(value)
+        else:
+            conditions = {}
+        items = self.proxys.find(conditions, limit=count).sort(
             [("speed", pymongo.ASCENDING), ("score", pymongo.DESCENDING)])
         results = []
         for item in items:
@@ -55,4 +64,13 @@ class MongoHelper(ISqlHelper):
         return results
 
 
-
+if __name__ == '__main__':
+    # from db.MongoHelper import MongoHelper as SqlHelper
+    # sqlhelper = SqlHelper()
+    # sqlhelper.init_db()
+    # # print  sqlhelper.select(None,{'types':u'1'})
+    # items= sqlhelper.proxys.find({'types':0})
+    # for item in items:
+    # print item
+    # # # print sqlhelper.select(None,{'types':u'0'})
+    pass
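
This hunk is the "fixed a pymongo query bug" item from the README change below: API query parameters arrive as strings, documents store types/protocol as ints, and MongoDB equality matching is type-sensitive, so u'0' never matches 0 until it is cast. A reproduction sketch (assumes a local MongoDB and the pymongo 3.x API this repo targets):

import pymongo

coll = pymongo.MongoClient('mongodb://localhost:27017/').proxy.proxys
coll.insert_one({'ip': '1.2.3.4', 'port': 80, 'types': 0, 'protocol': 0,
                 'speed': 1.5, 'score': 0})
print(coll.find({'types': u'0'}).count())       # 0 -- a string never matches an int
print(coll.find({'types': int(u'0')}).count())  # 1 -- cast first, as select() now does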

+ 0 - 0
IPProxyPool_py3/db/RedisHelper.py → db/RedisHelper.py


+ 0 - 0
IPProxyPool_py3/db/SqlHelper.py → db/SqlHelper.py


+ 0 - 0
IPProxyPool_py2/db/__init__.py → db/__init__.py


+ 0 - 1
IPProxyPool_py3/spider/HtmlDownloader.py → spider/HtmlDownloader.py

@@ -33,7 +33,6 @@ class Html_Downloader(object):
                     except Exception as e:
                         count += 1
 
-
                 else:
                     return r.text
 

+ 22 - 21
IPProxyPool_py3/spider/HtmlPraser.py → spider/HtmlPraser.py

@@ -1,9 +1,9 @@
 # coding:utf-8
 import base64
 from config import QQWRY_PATH, CHINA_AREA
-
 from util.IPAddress import IPAddresss
 import re
+from util.compatibility import text_
 
 __author__ = 'qiye'
 from lxml import etree
@@ -36,7 +36,7 @@ class Html_Parser(object):
         :return:
         '''
         for area in CHINA_AREA:
-            if addr.find(area) != -1:
+            if text_(area) in addr:
                 return True
         return False
 
@@ -58,16 +58,15 @@ class Html_Parser(object):
                 type = 0
                 protocol = 0
                 addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                country = ''
-                area = ''
-                if addr.find('省') != -1 or self.AuthCountry(addr):
-                    country = '国内'
+                country = text_('')
+                area = text_('')
+                if text_('省') in addr or self.AuthCountry(addr):
+                    country = text_('国内')
                     area = addr
                 else:
-                    country = '国外'
+                    country = text_('国外')
                     area = addr
             except Exception as e:
-
                 continue
             # updatetime = datetime.datetime.now()
             # ip, port, types (0 = elite/high-anonymity, 1 = transparent), protocol (0 = http, 1 = https), country, area (province/city), updatetime (last update time)
@@ -97,21 +96,22 @@ class Html_Parser(object):
                     type = 0
                     # if parser['postion']['protocol'] > 0:
                     # protocol = match[parser['postion']['protocol']]
-                    #     if protocol.lower().find('https')!=-1:
+                    # if protocol.lower().find('https')!=-1:
                     #         protocol = 1
                     #     else:
                     #         protocol = 0
                     # else:
                     protocol = 0
                     addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                    country = ''
-                    area = ''
-                    if addr.find('省') != -1 or self.AuthCountry(addr):
-                        country = '中国'
+                    country = text_('')
+                    area = text_('')
+                    # print(ip,port)
+                    if text_('省') in addr or self.AuthCountry(addr):
+                        country = text_('国内')
                         area = addr
                     else:
-                        country = addr
-                        area = ''
+                        country = text_('国外')
+                        area = addr
                 except Exception as e:
                     continue
 
@@ -150,14 +150,15 @@ class Html_Parser(object):
                     type = 0
                     protocol = 0
                     addr = self.ips.getIpAddr(self.ips.str2ip(ip))
-                    country = ''
-                    area = ''
-                    if addr.find('省') != -1 or self.AuthCountry(addr):
-                        country = '中国'
+                    country = text_('')
+                    area = text_('')
+                    # print(ip,port)
+                    if text_('省') in addr or self.AuthCountry(addr):
+                        country = text_('国内')
                         area = addr
                     else:
-                        country = addr
-                        area = ''
+                        country = text_('国外')
+                        area = addr
                 except Exception as e:
                     continue
                 proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country,
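
The repeated `text_('省') in addr` change is what lets these parsers run under both interpreters: on py2 the literal `'省'` is a UTF-8 byte string while `addr` comes back from the qqwry lookup as text, and mixing the two in `find()`/`in` either fails to match or raises. A small sketch of the pitfall, with a made-up address:

```python
# Under py2, bytes-vs-unicode comparisons are implicitly ascii-decoded and
# blow up on Chinese text; decoding both sides with text_() first makes the
# membership test behave identically on py2 and py3.
from util.compatibility import text_

addr = u'浙江省杭州市'        # text, as returned by the qqwry address lookup
print(text_('省') in addr)    # True on py2 and py3
# print('省' in addr)         # py3: True; py2: UnicodeDecodeError
```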

+ 0 - 1
IPProxyPool_py3/spider/ProxyCrawl.py → spider/ProxyCrawl.py

@@ -11,7 +11,6 @@ from spider.HtmlDownloader import Html_Downloader
 from spider.HtmlPraser import Html_Parser
 from validator.Validator import validator, getMyIP, detect_from_db
 
-
 __author__ = 'qiye'
 from gevent import monkey
 

+ 0 - 0
IPProxyPool_py2/spider/__init__.py → spider/__init__.py


+ 0 - 0
IPProxyPool_py2/start.bat → start.bat


+ 0 - 0
IPProxyPool_py2/test/__init__.py → test/__init__.py


+ 0 - 0
IPProxyPool_py3/test/test.py → test/test.py


+ 0 - 0
IPProxyPool_py3/test/testIPAddress.py → test/testIPAddress.py


+ 1 - 1
IPProxyPool_py3/test/testIPType.py → test/testIPType.py

@@ -18,7 +18,7 @@ def checkProxyType(selfip, proxies):
         # if r.ok:
         # root = etree.HTML(r.text)
         # ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
-        #     http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
+        # http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
         #     http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
         #     # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
         #     if ip==selfip:

+ 0 - 0
IPProxyPool_py3/test/testbase64.py → test/testbase64.py


+ 1 - 1
IPProxyPool_py3/test/testhttpserver.py → test/testhttpserver.py

@@ -18,7 +18,7 @@ class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         # message_parts = [
         # 'CLIENT VALUES:',
         # 'client_address=%s (%s)' % (self.client_address,
-        #                                     self.address_string()),
+        # self.address_string()),
         #         'command=%s' % self.command,
         #         'path=%s' % self.path,
         #         'real path=%s' % parsed_path.path,

+ 0 - 0
IPProxyPool_py3/test/testlist.py → test/testlist.py


+ 0 - 0
IPProxyPool_py3/test/testlxml.py → test/testlxml.py


+ 0 - 0
IPProxyPool_py3/test/testqueue.py → test/testqueue.py


+ 0 - 0
IPProxyPool_py2/test/testsql.py → test/testsql.py


+ 7 - 6
IPProxyPool_py3/util/IPAddress.py → util/IPAddress.py

@@ -7,6 +7,7 @@ import socket
 import struct
 
 import logging
+from util.compatibility import text_
 
 logger = logging.getLogger('util')
 
@@ -40,8 +41,8 @@ class IPAddresss:
 
     def getAddr(self, offset, ip=0):
         self.ipdb.seek(offset + 4)
-        countryAddr = ""
-        areaAddr = ""
+        countryAddr = text_("")
+        areaAddr = text_("")
         str = self.ipdb.read(1)
         (byte,) = struct.unpack('B', str)
         if byte == 0x01:
@@ -61,7 +62,7 @@ class IPAddresss:
         else:
             countryAddr = self.getString(offset + 4)
             areaAddr = self.getAreaAddr()
-        return countryAddr + " " + areaAddr
+        return countryAddr + text_(" ") + areaAddr
 
     def dump(self, first, last):
         if last > self.indexCount:
@@ -73,7 +74,7 @@ class IPAddresss:
             (ip, of1, of2) = struct.unpack("IHB", buf)
             address = self.getAddr(of1 + (of2 << 16))
             # convert GBK to UTF-8
-            address = str(address, 'gbk').encode("utf-8")
+            address = text_(address, 'gbk').encode("utf-8")
             logger.info("%d %s %s" % (index, self.ip2str(ip), address))
 
     def setIpRange(self, index):
@@ -106,9 +107,9 @@ class IPAddresss:
         if self.curStartIp <= ip <= self.curEndIp:
             address = self.getAddr(self.curEndIpOffset)
             # convert GBK to UTF-8
-            address = str(address)
+            address = text_(address)
         else:
-            address = "未找到该IP的地址"
+            address = text_("未找到该IP的地址")
         return address
 
     def getIpRange(self, ip):
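
Same shim, different encoding: the location strings inside qqwry.dat are GBK-encoded, so the raw bytes read from the file are decoded with `text_(..., 'gbk')` instead of the py3-only `str(..., 'gbk')`. A sketch with hypothetical bytes standing in for a real record:

```python
# text_() accepts an explicit encoding, so GBK records decode the same way
# on both interpreters (str(b, 'gbk') only exists on py3).
from util.compatibility import text_

raw = u'广东省'.encode('gbk')  # stand-in for bytes read from qqwry.dat
print(text_(raw, 'gbk'))       # -> 广东省, as text on py2 and py3
print(text_(u'已是文本'))      # already text: returned unchanged
```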

+ 0 - 0
IPProxyPool_py2/util/__init__.py → util/__init__.py


+ 22 - 0
util/compatibility.py

@@ -0,0 +1,22 @@
+# coding:utf-8
+import sys
+
+PY3 = sys.version_info[0] == 3
+if PY3:
+    text_type = str
+    binary_type = bytes
+else:
+    text_type = unicode
+    binary_type = str
+
+
+def text_(s, encoding='utf-8', errors='strict'):
+    if isinstance(s, binary_type):
+        return s.decode(encoding, errors)
+    return s
+
+
+def bytes_(s, encoding='utf-8', errors='strict'):
+    if isinstance(s, text_type):
+        return s.encode(encoding, errors)
+    return s
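
This new shim is the whole py2/py3 bridge for the merge: `text_()` normalizes to the text type, `bytes_()` to the byte type, and each is a no-op when handed the right type already. A quick usage sketch:

```python
# Round-trip demo for the shim; the byte literal is UTF-8 for '国内'.
from util.compatibility import text_, bytes_, PY3

s = text_(b'\xe5\x9b\xbd\xe5\x86\x85')   # bytes -> text u'国内'
b = bytes_(s)                            # text -> UTF-8 bytes
assert text_(s) is s and bytes_(b) is b  # no-ops on the right type
print(PY3, repr(s), repr(b))
```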

+ 0 - 0
IPProxyPool_py2/util/exception.py → util/exception.py


+ 0 - 0
IPProxyPool_py3/util/logger.py → util/logger.py


+ 0 - 3
IPProxyPool_py3/validator/Validator.py → validator/Validator.py

@@ -1,13 +1,10 @@
 # coding:utf-8
 import json
 from multiprocessing import Process
-import re
 import gevent
 
-from lxml import etree
 import requests
 import time
-from config import TEST_URL
 import config
 from db.DataStore import sqlhelper
 from util.exception import Test_URL_Fail

+ 0 - 0
IPProxyPool_py2/validator/__init__.py → validator/__init__.py


Some files were not shown because too many files changed in this diff