qiyeboy 8 rokov pred
rodič
commit
2b3ffbcbbf

+ 8 - 0
.idea/IPProxys.iml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 2.7.10 (D:\Python27\python.exe)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

+ 4 - 0
.idea/encodings.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
+</project>

+ 27 - 0
.idea/misc.xml

@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DaemonCodeAnalyzer">
+    <disable_hints />
+  </component>
+  <component name="DependencyValidationManager">
+    <option name="SKIP_IMPORT_STATEMENTS" value="false" />
+  </component>
+  <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+    <OptionsSetting value="true" id="Add" />
+    <OptionsSetting value="true" id="Remove" />
+    <OptionsSetting value="true" id="Checkout" />
+    <OptionsSetting value="true" id="Update" />
+    <OptionsSetting value="true" id="Status" />
+    <OptionsSetting value="true" id="Edit" />
+    <ConfirmationsSetting value="0" id="Add" />
+    <ConfirmationsSetting value="0" id="Remove" />
+  </component>
+  <component name="ProjectModuleManager">
+    <modules />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.10 (D:\Python27\python.exe)" project-jdk-type="Python SDK" />
+  <component name="RunManager">
+    <list size="0" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/IPProxys.iml" filepath="$PROJECT_DIR$/.idea/IPProxys.iml" />
+    </modules>
+  </component>
+</project>

+ 5 - 0
.idea/scopes/scope_settings.xml

@@ -0,0 +1,5 @@
+<component name="DependencyValidationManager">
+  <state>
+    <option name="SKIP_IMPORT_STATEMENTS" value="false" />
+  </state>
+</component>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="" />
+  </component>
+</project>

+ 49 - 0
.idea/workspace.xml

@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <option name="TRACKING_ENABLED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
+  <component name="CreatePatchCommitExecutor">
+    <option name="PATCH_PATH" value="" />
+  </component>
+  <component name="DaemonCodeAnalyzer">
+    <disable_hints />
+  </component>
+  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+    <OptionsSetting value="true" id="Add" />
+    <OptionsSetting value="true" id="Remove" />
+    <OptionsSetting value="true" id="Checkout" />
+    <OptionsSetting value="true" id="Update" />
+    <OptionsSetting value="true" id="Status" />
+    <OptionsSetting value="true" id="Edit" />
+    <ConfirmationsSetting value="0" id="Add" />
+    <ConfirmationsSetting value="0" id="Remove" />
+  </component>
+  <component name="RunManager">
+    <list size="0" />
+  </component>
+  <component name="ShelveChangesManager" show_recycled="false" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <option name="number" value="Default" />
+    </task>
+    <servers />
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="VcsManagerConfiguration">
+    <option name="myTodoPanelSettings">
+      <TodoPanelSettings />
+    </option>
+  </component>
+  <component name="XDebuggerManager">
+    <breakpoint-manager />
+    <watches-manager />
+  </component>
+</project>

+ 40 - 0
IPProxys.py

@@ -0,0 +1,40 @@
+#coding:utf-8
+import BaseHTTPServer
+import threading
+from api.apiServer import WebRequestHandler
+from config import API_PORT
+from db.SQLiteHelper import SqliteHelper
+from spider.ProxySpider import ProxySpider
+
+class IPProxys(object):
+
+    def startApiServer(self):
+        '''
+        启动api服务器
+        :return:
+        '''
+        server = BaseHTTPServer.HTTPServer(('0.0.0.0',API_PORT), WebRequestHandler)
+        server.serve_forever()
+
+
+
+    def startSpider(self):
+        print 'start  run  ----'
+        spider = ProxySpider()
+        spider.run()
+
+
+
if __name__=="__main__":
    # Run the api server and the spider side by side, each in its own thread.
    app = IPProxys()
    workers = [
        threading.Thread(target=app.startApiServer),
        threading.Thread(target=app.startSpider),
    ]
    for worker in workers:
        worker.start()
+
+
+
+
+
+

+ 1 - 0
api/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 62 - 0
api/apiServer.py

@@ -0,0 +1,62 @@
+#coding:utf-8
+'''
+定义几个关键字,count type,protocol,country,area,
+'''
+import urllib
+from config import API_PORT
+from db.SQLiteHelper import SqliteHelper
+
+__author__ = 'Xaxdus'
+
+import BaseHTTPServer
+import json
+import urlparse
+
+# keylist=['count', 'types','protocol','country','area']
+class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+
+    def do_GET(self):
+        """
+        """
+        dict={}
+
+        parsed_path = urlparse.urlparse(self.path)
+        try:
+            query = urllib.unquote(parsed_path.query)
+            print query
+            if query.find('&')!=-1:
+                params = query.split('&')
+                for param in params:
+                    dict[param.split('=')[0]]=param.split('=')[1]
+            else:
+                    dict[query.split('=')[0]]=query.split('=')[1]
+            str_count=''
+            conditions=[]
+            for key in dict:
+                if key =='count':
+                    str_count = 'lIMIT 0,%s'% dict[key]
+                if key =='country' or key =='area':
+                    conditions .append(key+" LIKE '"+dict[key]+"%'")
+                elif key =='types' or key =='protocol' or key =='country' or key =='area':
+                    conditions .append(key+"="+dict[key])
+            if len(conditions)>1:
+                conditions = ' AND '.join(conditions)
+            else:
+                conditions =conditions[0]
+            sqlHelper = SqliteHelper()
+            result = sqlHelper.select(sqlHelper.tableName,conditions,str_count)
+            # print type(result)
+            # for r in  result:
+            #     print r
+            print result
+            data = json.dumps(result)
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(data)
+        except Exception,e:
+            print e
+            self.send_response(404)
+
if __name__=='__main__':
    # Standalone mode: serve the api without starting the spider.
    httpd = BaseHTTPServer.HTTPServer(('0.0.0.0', API_PORT), WebRequestHandler)
    httpd.serve_forever()

+ 150 - 0
config.py

@@ -0,0 +1,150 @@
+#coding:utf-8
+'''
+定义规则 urls:url列表
+         type:解析方式,取值 regular(正则表达式),xpath(xpath解析),module(自定义第三方模块解析)
+         patten:可以是正则表达式,可以是xpath语句不过要和上面的相对应
+'''
+import random
+
+'''
+ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https),country(国家),area(省市),updatetime(更新时间)
+ speed(连接速度)
+'''
# Scrape rules, one dict per source site:
#   urls    -- pages to fetch
#   type    -- parse strategy ('xpath' here; 'regular'/'module' also declared above)
#   pattern -- xpath selecting one row node per proxy
#   postion -- (sic: typo kept -- it is a data key read by Html_Parser)
#              per-field xpath relative to the row; empty protocol means http
parserList = [
        {
            # 66ip.cn front page plus pages 2-11
            'urls': ['http://www.66ip.cn/%s.html'% n for n in ['index']+range(2,12)],
            'type':'xpath',
            'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
            'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
        },
        {
            # 66ip.cn per-area listings: 34 areas x 9 pages
            'urls': ['http://www.66ip.cn/areaindex_%s/%s.html'%(m,n) for m in range(1,35) for n in range(1,10)],
            'type':'xpath',
            'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
            'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
        },
        {
            # kuaidaili.com combined list
            'urls': ['http://www.kuaidaili.com/proxylist/%s/'% n for n in range(1,11)],
            'type': 'xpath',
            'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
            'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
        },
        {
            # kuaidaili.com free lists (in/out x high-anon/transparent)
            'urls': ['http://www.kuaidaili.com/free/%s/%s/'% (m,n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in range(1,11)],
            'type':'xpath',
            'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
            'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
        },
        {
            # cz88.net http proxy pages
            'urls': ['http://www.cz88.net/proxy/%s'% m for m in ['index.shtml']+['http_%s.shtml' % n for n in range(2, 11)]],
            'type':'xpath',
            'pattern':".//*[@id='boxright']/div/ul/li[position()>1]",
            'postion':{'ip':'./div[1]','port':'./div[2]','type':'./div[3]','protocol':''}

        },
        {
            # ip181.com daily lists
            'urls': ['http://www.ip181.com/daili/%s.html'% n for n in range(1, 11)],
            'type':'xpath',
            'pattern': "html/body/div[2]/div/div[2]/div/div[3]/table/tbody/tr[position()>1]",
            'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}

        },
        {
            # xicidaili.com: nn/nt/wn/wt sections, 7 pages each
            'urls': ['http://www.xicidaili.com/%s/%s'%(m,n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8) ],
            'type':'xpath',
            'pattern': ".//*[@id='ip_list']/tr[position()>1]",
            'postion':{'ip':'./td[2]','port':'./td[3]','type':'./td[5]','protocol':'./td[6]'}
        }

        ]
+'''
+数据库的配置
+'''
# Database backend settings.
DB_CONFIG={
    'dbType':'sqlite',#sqlite,mysql,mongodb
    'dbPath':'./data/proxy.db',#only used by the sqlite backend
    'dbUser':'',#user name
    'dbPass':'',#password
    'dbName':''#database name

}

# Chinese provinces/regions; Html_Parser.AuthCountry uses this list to decide
# whether a qqwry address string belongs to China.
CHINA_AREA=[u'河北',u'山东',u'辽宁',u'黑龙江',u'吉林'
    ,u'甘肃',u'青海',u'河南',u'江苏',u'湖北',u'湖南',
            u'江西',u'浙江',u'广东',u'云南',u'福建',
            u'台湾',u'海南',u'山西',u'四川',u'陕西',
            u'贵州',u'安徽',u'重庆',u'北京',u'上海',u'天津',u'广西',u'内蒙',u'西藏',u'新疆',u'宁夏',u'香港',u'澳门']
# Path of the qqwry.dat ip-location database.
QQWRY_PATH="./data/qqwry.dat"

# Size of the crawler's gevent pool.
THREADNUM = 20
# Port the local proxy api listens on.
API_PORT=8000
'''
爬虫爬取和检测ip的设置条件
不需要检测ip是否已经存在,因为会定时清理
'''
UPDATE_TIME=30*60#re-validate the stored proxies every half hour (seconds)
MINNUM = 500 #start the crawler when fewer than this many valid ips remain
MAXTIME = 24*60 #maximum age of a stored proxy before it is deleted

TIMEOUT = 5#socket timeout in seconds



'''
反爬虫的设置
'''
'''
重试次数
'''
RETRY_TIME=3


'''
USER_AGENTS 随机头信息
'''
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# NOTE(review): random.choice runs once at import time, so every request in a
# process reuses the same User-Agent -- confirm whether per-request rotation
# was intended.
HEADER = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
}

# Url fetched by the validator to test whether a proxy works.
TEST_URL='http://www.ip138.com/'

BIN
data/proxy.db


BIN
data/qqwry.dat


+ 109 - 0
db/SQLiteHelper.py

@@ -0,0 +1,109 @@
+#coding:utf-8
+from config import DB_CONFIG
+from db.SqlHelper import SqlHelper
+
+__author__ = 'Xaxdus'
+import sqlite3
class SqliteHelper(SqlHelper):
    """sqlite-backed proxy store; see SqlHelper for the record layout."""

    # name of the table that holds the proxy records
    tableName='proxys'
    def __init__(self):
        '''
        Open the database connection and create the table if needed.
        :return:
        '''
        # check_same_thread=False: the connection is shared between the
        # api-server thread and the spider thread
        self.database = sqlite3.connect(DB_CONFIG['dbPath'],check_same_thread=False)
        self.cursor = self.database.cursor()
        # make sure the table exists
        self.createTable()


    def createTable(self):
        # types: 0 elite / 1 transparent; protocol: 0 http / 1 https;
        # speed defaults to 100; updatetime defaults to local now
        self.cursor.execute("create TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY ,ip VARCHAR(16) NOT NULL,"
               "port INTEGER NOT NULL ,types INTEGER NOT NULL ,protocol INTEGER NOT NULL DEFAULT 0,"
               "country VARCHAR (20) NOT NULL,area VARCHAR (20) NOT NULL,updatetime TimeStamp NOT NULL DEFAULT (datetime('now','localtime')) ,speed DECIMAL(3,2) NOT NULL DEFAULT 100)"% self.tableName)

        self.database.commit()

    def select(self,tableName,condition,count):
        '''
        Fetch (ip, port) pairs, fastest first.

        :param tableName: table name
        :param condition: raw WHERE clause text -- NOTE(review): interpolated
                          verbatim, so callers passing user input create a
                          SQL injection risk
        :param count: raw LIMIT clause text ('' for no limit)
        :return: list of (ip, port) tuples
        '''
        command = 'SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC %s '%(tableName,condition,count)

        self.cursor.execute(command)
        result = self.cursor.fetchall()
        return result

    def selectAll(self):
        # every distinct (ip, port), fastest first
        self.cursor.execute('SELECT DISTINCT ip,port FROM %s ORDER BY speed ASC '%self.tableName)
        result = self.cursor.fetchall()
        return result

    def selectCount(self):
        # number of distinct ips, returned as a 1-tuple (cursor.fetchone row)
        self.cursor.execute('SELECT COUNT( DISTINCT ip) FROM %s'%self.tableName)
        count = self.cursor.fetchone()
        return count

    def selectOne(self,tableName,condition,value):
        '''
        Fetch the single fastest (ip, port) pair.

        :param tableName: table name
        :param condition: WHERE clause containing '?' placeholders
        :param value: values bound to the placeholders (guards against injection)
        :return: one (ip, port) tuple, or None when nothing matches
        '''
        self.cursor.execute('SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC'%(tableName,condition),value)
        result = self.cursor.fetchone()
        return result

    def update(self,tableName,condition,value):
        # condition carries the full 'SET ... WHERE ...' text with '?'
        # placeholders; value supplies the bound parameters
        self.cursor.execute('UPDATE %s %s'%(tableName,condition),value)
        self.database.commit()

    def delete(self,tableName,condition):
        '''
        Delete rows matching *condition* and commit.

        :param tableName: table name
        :param condition: raw WHERE clause text (interpolated verbatim)
        :return:
        '''
        deleCommand = 'DELETE FROM %s WHERE %s'%(tableName,condition)
        self.cursor.execute(deleCommand)
        self.commit()

    def commit(self):
        self.database.commit()


    def insert(self,tableName,value):
        # value is a proxy dict with ip/port/type/protocol/country/area/speed
        # keys; NOTE: does not commit -- batch_insert commits once at the end
        proxy = [value['ip'],value['port'],value['type'],value['protocol'],value['country'],value['area'],value['speed']]
        self.cursor.execute("INSERT INTO %s (ip,port,types,protocol,country,area,speed)VALUES (?,?,?,?,?,?,?)"% tableName
                            ,proxy)


    def batch_insert(self,tableName,values):
        # skips None entries, single commit after the loop
        for value in values:
            if value!=None:
                self.insert(self.tableName,value)
        self.database.commit()


    def close(self):
        self.cursor.close()
        self.database.close()
+
+
+
+if __name__=="__main__":
+    s = SqliteHelper()
+    print s.selectCount()[0]
+    # print s.selectAll()

+ 35 - 0
db/SqlHelper.py

@@ -0,0 +1,35 @@
+#coding:utf-8
+
+__author__ = 'Xaxdus'
+'''
+sql操作的基类
+包括ip,端口,types类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间)
+ speed(连接速度)
+'''
class SqlHelper(object):
    """Abstract base for sql storage backends.

    A proxy record carries: ip, port, types (0 elite / 1 transparent),
    protocol (0 http / 1 https http), country, area, updatetime and speed.
    Concrete backends (e.g. SqliteHelper) override these no-op hooks.
    """

    def __init__(self):
        pass

    def insert(self, value):
        """Store a single proxy record."""
        pass

    def batch_insert(self, values):
        """Store many proxy records at once."""
        pass

    def delete(self, condition):
        """Remove records matching *condition*."""
        pass

    def batch_delete(self, values):
        """Remove many records at once."""
        pass

    def update(self, condition, value):
        """Modify records matching *condition*."""
        pass

    def select(self, condition):
        """Fetch records matching *condition*."""
        pass

    def selectOne(self, tableName, condition, value):
        """Fetch a single record from *tableName*."""
        pass

    def close(self):
        """Release the underlying connection."""
        pass

+ 1 - 0
db/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 70 - 0
spider/HtmlDownLoader.py

@@ -0,0 +1,70 @@
+#coding:utf-8
+import random
+import config
+import json
+__author__ = 'Xaxdus'
+import requests
+class Html_Downloader(object):
+
+    @classmethod
+    def download(self,url):
+        count = 0#重试次数
+        r=''
+        try:
+            r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT)
+            r.encoding ='gbk'
+            while count< config.RETRY_TIME:
+                if (not r.ok) or len(r.content)<500 :
+                    response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT)
+                    if response.ok:
+                        content =  response.text
+                        choose = random.choice(json.loads(content))
+                        proxies={"https": "http://%s:%s"%(choose[0],choose[1])}
+                        try:
+                            r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+                            r.encoding ='gbk'
+                            count += 1
+                        except Exception,e:
+                             count += 1
+                    else:
+                        return None
+
+                else:
+                    return r.text
+
+            return None
+
+
+        except Exception,e:
+            while count< config.RETRY_TIME:
+                if r==''or (not r.ok) or len(r.content)<500 :
+                    try:
+                        response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT)
+                        if response.ok:
+                            content =  response.text
+                            choose = random.choice(json.loads(content))
+                            proxies={"https": "http://%s:%s"%(choose[0],choose[1])}
+                            try:
+                                r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
+                                r.encoding ='gbk'
+                                count += 1
+                            except Exception,e:
+                                 count += 1
+                        else:
+                            return None
+                    except Exception,e:
+                        return None
+
+                else:
+                    return r.text
+
+            return None
+
+
+
+
+
+
+
+
+

+ 85 - 0
spider/HtmlPraser.py

@@ -0,0 +1,85 @@
+#coding:utf-8
+import datetime
+from config import QQWRY_PATH, CHINA_AREA
+
+from util.IPAddress import IPAddresss
+from util.logger import logger
+
+__author__ = 'Xaxdus'
+from lxml import etree
+class Html_Parser(object):
+
+    def __init__(self):
+        self.ips = IPAddresss(QQWRY_PATH)
+    def parse(self,response,parser):
+        '''
+
+        :param response: 响应
+        :param type: 解析方式
+        :return:
+        '''
+        if parser['type']=='xpath':
+            proxylist=[]
+            root = etree.HTML(response)
+            proxys = root.xpath(parser['pattern'])
+            for proxy in proxys:
+                # print parser['postion']['ip']
+                ip = proxy.xpath(parser['postion']['ip'])[0].text
+                port = proxy.xpath(parser['postion']['port'])[0].text
+                type = proxy.xpath(parser['postion']['type'])[0].text
+                if type.find(u'高匿')!=-1:
+                    type = 0
+                else:
+                    type = 1
+                protocol=''
+                if len(parser['postion']['protocol']) > 0:
+                    protocol = proxy.xpath(parser['postion']['protocol'])[0].text
+                    if protocol.lower().find('https')!=-1:
+                        protocol = 1
+                    else:
+                        protocol = 0
+                else:
+                    protocol = 0
+                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
+                country = ''
+                area = ''
+                if addr.find(u'省')!=-1 or self.AuthCountry(addr):
+                    country = u'中国'
+                    area = addr
+                else:
+                    country = addr
+                    area = ''
+                # updatetime = datetime.datetime.now()
+                # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间)
+
+                # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
+                proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
+                print proxy
+                proxylist.append(proxy)
+
+            return proxylist
+
+    def AuthCountry(self,addr):
+        '''
+        用来判断地址是哪个国家的
+        :param addr:
+        :return:
+        '''
+        for area in CHINA_AREA:
+            if addr.find(area)!=-1:
+                return True
+        return False
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ 77 - 0
spider/ProxySpider.py

@@ -0,0 +1,77 @@
+#coding:utf-8
+from gevent.pool import Pool
+import requests
+import time
+from config import THREADNUM, parserList, MINNUM, UPDATE_TIME
+from db.SQLiteHelper import SqliteHelper
+from spider.HtmlDownLoader import Html_Downloader
+from spider.HtmlPraser import Html_Parser
+from validator.Validator import Validator
+
+
+__author__ = 'Xaxdus'
+from gevent import monkey
+monkey.patch_all()
+'''
+这个类的作用是描述爬虫的逻辑
+'''
+class ProxySpider(object):
+
+    def __init__(self):
+        self.crawl_pool = Pool(THREADNUM)
+        # self.sqlHelper = sqlHelper
+
+    def run(self):
+        while True:
+            print 'spider beginning -------'
+            sqlHelper = SqliteHelper()
+            print 'validator beginning -------'
+            validator = Validator(sqlHelper)
+            count = validator.run_db()
+            print 'validator end ----count=%s'%count
+            if count[0]< MINNUM:
+                proxys = self.crawl_pool.map(self.crawl,parserList)
+                #这个时候proxys的格式是[[{},{},{}],[{},{},{}]]
+                # print proxys
+                #这个时候应该去重:
+
+                proxys_tmp = []
+                for proxy in proxys:
+                    proxys_tmp.extend(proxy)
+
+                proxys = proxys_tmp
+                print 'first_proxys--%s',len(proxys)
+                #这个时候proxys的格式是[{},{},{},{},{},{}]
+                proxys_tmp=None
+                #这个时候开始去重:
+                proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
+                print 'end_proxys--%s',len(proxys)
+                print 'spider proxys -------%s'%type(proxys)
+                proxys = validator.run_list(proxys)#这个是检测后的ip地址
+
+
+                sqlHelper.batch_insert(sqlHelper.tableName,proxys)
+
+
+                print 'success ip =%s'%sqlHelper.selectCount()
+                sqlHelper.close()
+            print 'spider end -------'
+            time.sleep(UPDATE_TIME)
+
+
+    def crawl(self,parser):
+        proxys = []
+        html_parser = Html_Parser()
+        for url in parser['urls']:
+           response = Html_Downloader.download(url)
+           # print response
+           if response!=None:
+               proxylist= html_parser.parse(response,parser)
+               if proxylist != None:
+                  proxys.extend(proxylist)
+        return proxys
+
+
if __name__=="__main__":
    # manual run: crawl loop only, without the api server
    ProxySpider().run()

+ 1 - 0
spider/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 1 - 0
start.bat

@@ -0,0 +1 @@
+python IPProxys.py

+ 1 - 0
test/__init__.py

@@ -0,0 +1 @@
+__author__ = 'Xaxdus'

+ 43 - 0
test/testhttpserver.py

@@ -0,0 +1,43 @@
+
+#coding:utf-8
+import BaseHTTPServer
+import json
+import urlparse
+class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+    def do_GET(self):
+        """
+        """
+        print self.path
+        parsed_path = urlparse.urlparse(self.path)
+        print parsed_path
+        print parsed_path.query
+        # message_parts = [
+        #         'CLIENT VALUES:',
+        #         'client_address=%s (%s)' % (self.client_address,
+        #                                     self.address_string()),
+        #         'command=%s' % self.command,
+        #         'path=%s' % self.path,
+        #         'real path=%s' % parsed_path.path,
+        #         'query=%s' % parsed_path.query,
+        #         'request_version=%s' % self.request_version,
+        #         '',
+        #         'SERVER VALUES:',
+        #         'server_version=%s' % self.server_version,
+        #         'sys_version=%s' % self.sys_version,
+        #         'protocol_version=%s' % self.protocol_version,
+        #         '',
+        #         'HEADERS RECEIVED:',
+        #         ]
+        # for name, value in sorted(self.headers.items()):
+        #     message_parts.append('%s=%s' % (name, value.rstrip()))
+        # message_parts.append('')
+        # message = '\r\n'.join(message_parts)
+        data1 = [{'ip':'192.168.0.0','port':456}]*10
+        d1 = json.dumps(data1,sort_keys=True,indent=4)
+        message=('192.168.1.1',80)
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(d1)
+
# Runs at import time: bind port 8000 and serve the dummy handler forever.
server = BaseHTTPServer.HTTPServer(('0.0.0.0',8000), WebRequestHandler)
server.serve_forever()

+ 36 - 0
test/testlist.py

@@ -0,0 +1,36 @@
#coding:utf-8
# Scratch/demo file: experiments that led to ProxySpider's tuple-based
# de-duplication of proxy dicts (dicts are unhashable; tuples of items are).
from decimal import Decimal

__author__ = 'Xaxdus'


# list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)]
#
# list = [(1,10)]*10
#
# for m,n in list:
#     print m,n
#
#
# list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list]
# print list2

# x=Decimal('0.998531571219').quantize(Decimal('0.00'))
# a= 0.998531571219
# value = round(a, 3)
# print x,type(x),value
# proxys=[]
# proxy=[123,1234]
# proxys.append(proxy)
#
# proxy=[123,1234]
# proxys.append(proxy)
#
# print proxys
# sample data with one exact duplicate entry
l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}]

# for d in l:
#    print  [tuple(d.items())]
print [tuple(d.items()) for d in l]

print [dict(t) for t in set([tuple(d.items()) for d in l])]

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 127 - 0
test/testlxml.py


+ 143 - 0
util/IPAddress.py

@@ -0,0 +1,143 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+
+import socket
+import struct
+
+
class IPAddresss:
    """Reader for a binary IP-geolocation database.

    The layout matches the QQWry ("纯真") format by the look of it -- TODO
    confirm: an 8-byte header holding the first/last index offsets, then
    7-byte index records (uint32 start IP + 3-byte record offset), with
    GBK-encoded, NUL-terminated location strings and 0x01/0x02 redirect
    bytes pointing at shared strings.
    """

    def __init__(self, ipdbFile):
        """Open the database file and read the index bounds from its header."""
        self.ipdb = open(ipdbFile, "rb")
        # NOTE(review): the handle is never closed; instances are expected
        # to live for the whole process.
        str = self.ipdb.read(8)  # NOTE(review): shadows the builtin 'str'
        (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
        # Index records are 7 bytes each; both bounds are inclusive.
        self.indexCount = (self.lastIndex - self.firstIndex)/7+1
        # (debug) print database version and total record count:
        # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount)

    def getVersion(self):
        """Return the DB version string, stored under pseudo-IP 255.255.255.0."""
        s = self.getIpAddr(0xffffff00L)
        return s

    def getAreaAddr(self, offset=0):
        """Read the 'area' string at *offset* (or the current file position).

        A leading mode byte of 0x01 or 0x02 means the string is redirected
        to the 3-byte offset that follows; otherwise it is stored in place.
        """
        if offset:
            self.ipdb.seek(offset)
        str = self.ipdb.read(1)
        (byte,) = struct.unpack('B', str)
        if byte == 0x01 or byte == 0x02:
            p = self.getLong3()
            if p:
                return self.getString(p)
            else:
                return ""
        else:
            # Not a redirect: step back over the byte just consumed and
            # read the string in place.
            self.ipdb.seek(-1, 1)
            return self.getString(offset)

    def getAddr(self, offset, ip=0):
        """Return "country area" text for the record starting at *offset*.

        After the record's 4-byte end-IP field: mode 0x01 redirects the
        whole record, mode 0x02 redirects only the country string (area
        follows inline); any other byte means both strings are inline.
        """
        self.ipdb.seek(offset + 4)  # skip the 4-byte end-IP field
        countryAddr = ""
        areaAddr = ""
        str = self.ipdb.read(1)
        (byte,) = struct.unpack('B', str)
        if byte == 0x01:
            # Whole record redirected elsewhere.
            countryOffset = self.getLong3()
            self.ipdb.seek(countryOffset)
            str = self.ipdb.read(1)
            (b,) = struct.unpack('B', str)
            if b == 0x02:
                # Country redirected a second time; area follows the pointer.
                countryAddr = self.getString(self.getLong3())
                self.ipdb.seek(countryOffset + 4)
            else:
                countryAddr = self.getString(countryOffset)
            areaAddr = self.getAreaAddr()
        elif byte == 0x02:
            # Only the country string is redirected.
            countryAddr = self.getString(self.getLong3())
            areaAddr = self.getAreaAddr(offset + 8)
        else:
            # Both strings stored inline.
            countryAddr = self.getString(offset + 4)
            areaAddr = self.getAreaAddr()
        return countryAddr + " " + areaAddr

    def dump(self, first , last):
        """Print index records [first, last) as "index<TAB>ip<TAB>address"."""
        if last > self.indexCount :
            last = self.indexCount
        for index in range(first, last):
            offset = self.firstIndex + index * 7
            self.ipdb.seek(offset)
            buf = self.ipdb.read(7)
            (ip, of1, of2) = struct.unpack("IHB", buf)
            address = self.getAddr(of1 + (of2 << 16))
            # Convert GBK to UTF-8 for printing.
            address = unicode(address, 'gbk').encode("utf-8")
            print "%d\t%s\t%s" % (index, self.ip2str(ip), address)

    def setIpRange(self, index):
        """Load index record *index* into curStartIp/curEndIp/curEndIpOffset."""
        offset = self.firstIndex + index * 7
        self.ipdb.seek(offset)
        buf = self.ipdb.read(7)
        # 7-byte record: uint32 start IP + 3-byte offset of the record body.
        (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
        self.curEndIpOffset = of1 + (of2 << 16)
        self.ipdb.seek(self.curEndIpOffset)
        buf = self.ipdb.read(4)
        (self.curEndIp,) = struct.unpack("I", buf)

    def getIpAddr(self, ip):
        """Binary-search the index for *ip*; return its location string
        (unicode decoded from GBK) or a not-found message."""
        L = 0
        R = self.indexCount - 1
        while L < R-1:
            M = (L + R) / 2
            self.setIpRange(M)
            if ip == self.curStartIp:
                L = M
                break
            if ip > self.curStartIp:
                L = M
            else:
                R = M
        self.setIpRange(L)
        # version information, 255.255.255.X, urgy but useful
        if ip & 0xffffff00L == 0xffffff00L:
            self.setIpRange(R)
        if self.curStartIp <= ip <= self.curEndIp:
            address = self.getAddr(self.curEndIpOffset)
            # Decode the GBK bytes to unicode.
            address = unicode(address, 'gbk')
        else:
            address = u"未找到该IP的地址"
        return address

    def getIpRange(self, ip):
        """Return the "start - end" dotted-quad range containing *ip*."""
        self.getIpAddr(ip)
        range = self.ip2str(self.curStartIp) + ' - ' \
            + self.ip2str(self.curEndIp)
        return range

    def getString(self, offset = 0):
        """Read a NUL-terminated byte string at *offset* (or current pos)."""
        if offset :
            self.ipdb.seek(offset)
        str = ""
        ch = self.ipdb.read(1)
        (byte,) = struct.unpack('B', ch)
        while byte != 0:
            str += ch
            ch = self.ipdb.read(1)
            (byte,) = struct.unpack('B', ch)
        return str

    def ip2str(self, ip):
        """Format a host-order uint32 as dotted-quad text."""
        return str(ip >> 24)+'.'+str((ip >> 16) & 0xffL)+'.'+str((ip >> 8) & 0xffL)+'.'+str(ip & 0xffL)

    def str2ip(self, s):
        """Parse dotted-quad *s* into a uint32 with its byte order swapped
        to match the ordering used by the database index."""
        (ip,) = struct.unpack('I', socket.inet_aton(s))
        return ((ip >> 24) & 0xffL) | ((ip & 0xffL) << 24) | ((ip >> 8) & 0xff00L) | ((ip & 0xff00L) << 8)

    def getLong3(self, offset=0):
        """Read a 3-byte little-endian unsigned integer at *offset* (or here)."""
        if offset:
            self.ipdb.seek(offset)
        str = self.ipdb.read(3)
        (a, b) = struct.unpack('HB', str)
        return (b << 16) + a
+
+
+

+ 4 - 0
util/__init__.py

@@ -0,0 +1,4 @@
# Package marker for the util package; no runtime behavior.

__author__ = 'Xaxdus'


+ 10 - 0
util/logger.py

@@ -0,0 +1,10 @@
#coding:utf-8
# Tiny logging helper: records each fetched proxy at INFO level.
import logging

__author__ = 'Xaxdus'


# Root logger, configured once at import time.  The original called
# setLevel(INFO) inside logger_proxy on every invocation, which was
# redundant work; level configuration belongs in module setup.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def logger_proxy(proxy):
    """Log *proxy* (any printable object, typically an ip/port dict) at INFO."""
    logger.info(proxy)

+ 136 - 0
validator/Validator.py

@@ -0,0 +1,136 @@
+#coding:utf-8
+import datetime
+from gevent.pool import Pool
+import requests
+import time
+from config import TEST_URL
+import config
+from db.SQLiteHelper import SqliteHelper
+from gevent import monkey
+monkey.patch_all()
+__author__ = 'Xaxdus'
+
class Validator(object):
    """Checks proxy ip/port pairs for liveness by fetching TEST_URL through each.

    Proxies come either from the SQLite store (run_db) or from an in-memory
    list of dicts (run_list).  Checks run concurrently on a gevent pool of
    config.THREADNUM greenlets.
    """

    def __init__(self, sqlHelper=None):
        """Create the worker pool and remember the optional SQL helper.

        BUG FIX: the original defined __init__ twice; the second definition
        silently replaced the first, so the documented no-argument form was
        unusable.  Merged into a single constructor whose sqlHelper defaults
        to None (only run_list works without one).
        """
        self.detect_pool = Pool(config.THREADNUM)
        self.sqlHelper = sqlHelper

    def run_db(self):
        """Validate every proxy stored in the database.

        Deletes stale rows first, re-checks the remainder concurrently
        (failures are deleted inside detect_db), and returns the number of
        rows left, or 0 on any error.
        """
        try:
            # Drop entries older than config.MAXTIME minutes.
            self.deleteOld()
            # Re-check whatever is left.
            results = self.sqlHelper.selectAll()
            self.detect_pool.map(self.detect_db, results)
            return self.sqlHelper.selectCount()  # final number of live proxies
        except Exception as e:
            print(e)
            return 0

    def run_list(self, results):
        """Validate an in-memory list of proxy dicts, bypassing the database.

        :param results: iterable of {'ip': ..., 'port': ...} dicts
        :return: list with one entry per input -- the dict (with 'speed'
                 added) when the proxy works, or None when it does not.
        """
        return self.detect_pool.map(self.detect_list, results)

    def deleteOld(self):
        """Delete rows whose updatetime is older than config.MAXTIME minutes."""
        threshold = (datetime.datetime.now()
                     - datetime.timedelta(minutes=config.MAXTIME)).strftime('%Y-%m-%d %H:%M:%S')
        condition = "updatetime<'%s'" % threshold
        self.sqlHelper.delete(SqliteHelper.tableName, condition)

    def detect_db(self, result):
        """Check one database row; delete it on failure, else record its speed.

        :param result: row tuple whose first two fields are ip and port
        """
        ip = result[0]
        port = str(result[1])
        proxies = {"http": "http://%s:%s" % (ip, port)}
        start = time.time()
        try:
            r = requests.get(url=TEST_URL, headers=config.HEADER,
                             timeout=config.TIMEOUT, proxies=proxies)

            if not r.ok:
                condition = "ip='"+ip+"' AND "+'port='+port
                print('fail ip =%s' % ip)
                self.sqlHelper.delete(SqliteHelper.tableName, condition)
            else:
                speed = round(time.time()-start, 2)
                self.sqlHelper.update(SqliteHelper.tableName, 'SET speed=? WHERE ip=? AND port=?', (speed, ip, port))
                print('success ip =%s,speed=%s' % (ip, speed))
        except Exception:
            # Any request error (timeout, refused, bad proxy) means removal.
            condition = "ip='"+ip+"' AND "+'port='+port
            print('fail ip =%s' % ip)
            self.sqlHelper.delete(SqliteHelper.tableName, condition)

    def detect_list(self, proxy):
        """Check one proxy dict; return it with 'speed' added, or None on failure.

        :param proxy: {'ip': ..., 'port': ...} dict
        :return: the same dict augmented with 'speed', or None
        """
        ip = proxy['ip']
        port = proxy['port']
        proxies = {"http": "http://%s:%s" % (ip, port)}
        start = time.time()
        try:
            r = requests.get(url=TEST_URL, headers=config.HEADER,
                             timeout=config.TIMEOUT, proxies=proxies)

            if not r.ok:
                print('fail ip =%s' % ip)
                proxy = None
            else:
                speed = round(time.time()-start, 2)
                print('success ip =%s,speed=%s' % (ip, speed))
                proxy['speed'] = speed
        except Exception:
            print('fail ip =%s' % ip)
            proxy = None
        return proxy
+
+
if __name__=='__main__':
    # Manual smoke test kept commented out for reference; this module is
    # normally imported, so direct execution intentionally does nothing.
    # v = Validator()
    # results=[{'ip':'192.168.1.1','port':80}]*10
    # results = v.run(results)
    # print results
    pass

+ 1 - 0
validator/__init__.py

@@ -0,0 +1 @@
# Package marker for the validator package; no runtime behavior.
__author__ = 'Xaxdus'

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov