# david/IPProxyPool -- copy of https://github.com/qiyeboy/IPProxyPool.git
							#coding:utf-8
import base64
from config import QQWRY_PATH, CHINA_AREA

from util.IPAddress import IPAddresss
import re

__author__ = 'qiye'
from lxml import etree
class Html_Parser(object):
    '''
    Convert the raw body of a proxy-listing page into a list of proxy dicts.

    Each parsing method returns a list of dicts with the keys ``ip``,
    ``port``, ``types``, ``protocol``, ``country``, ``area`` and ``speed``.
    Entries that cannot be parsed are skipped rather than aborting the page.
    '''

    # Letter -> digit table used by CnproxyPraser to decode letter-encoded ports.
    _CNPROXY_DIGITS = {'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'}

    def __init__(self):
        # IP-to-location lookup backed by the database file at QQWRY_PATH.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self,response,parser):
        '''
        Dispatch to the parsing method selected by ``parser['type']``.

        :param response: response body to parse
        :param parser: parser description; ``type`` is one of ``'xpath'``,
            ``'regular'`` or ``'module'`` (for ``'module'`` the method named
            by ``parser['moduleName']`` is called)
        :return: list of proxy dicts, or None when the type is unknown or the
            named module method does not exist
        '''
        parser_type = parser['type']
        if parser_type == 'xpath':
            return self.XpathPraser(response,parser)
        if parser_type == 'regular':
            return self.RegularPraser(response,parser)
        if parser_type == 'module':
            handler = getattr(self,parser['moduleName'],None)
            # An unknown method name is treated like an unknown type instead
            # of failing with "'NoneType' object is not callable".
            if not callable(handler):
                return None
            return handler(response,parser)
        return None

    def AuthCountry(self,addr):
        '''
        Tell whether an address string mentions one of the CHINA_AREA names.

        :param addr: address text returned by the IP lookup
        :return: True if any CHINA_AREA entry occurs in ``addr``, else False
        '''
        return any(area in addr for area in CHINA_AREA)

    def _locate(self,ip):
        '''
        Look up ``ip`` and split the result into ``(country, area)``.

        :param ip: IP address as a dotted string
        :return: ``('中国', addr)`` when the address contains '省' or a
            CHINA_AREA name, otherwise ``(addr, '')``
        '''
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if '省' in addr or self.AuthCountry(addr):
            return '中国', addr
        return addr, ''

    def _build_proxy(self,ip,port):
        '''
        Build one proxy dict for ``ip``/``port`` with default metadata.

        ``types`` and ``protocol`` are always 0 (documented in the original
        code as "high anonymity" and "http"). The original author's note says
        the type reported by the listing sites is unreliable, so the default
        is kept and the real value is detected later.

        :param ip: IP address as a dotted string
        :param port: port, stored as given (int or str)
        :return: proxy dict
        '''
        country, area = self._locate(ip)
        return {'ip':ip,'port':port,'types':0,'protocol':0,'country':country,'area':area,'speed':100}

    def XpathPraser(self,response,parser):
        '''
        Parse a page using XPath expressions.

        :param response: HTML text of the page
        :param parser: dict with ``pattern`` (XPath selecting one node per
            proxy) and ``position`` (``ip`` / ``port`` XPaths relative to
            that node)
        :return: list of proxy dicts with integer ports
        '''
        proxylist=[]
        root = etree.HTML(response)
        # NOTE(review): depending on the lxml version an empty document may
        # yield None here; treat that as "no proxies" -- confirm.
        if root is None:
            return proxylist
        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                # Converting inside the try means a non-numeric or missing
                # port skips this row instead of aborting the whole page.
                port = int(row.xpath(parser['position']['port'])[0].text)
                proxylist.append(self._build_proxy(ip,port))
            except Exception:
                # Best-effort scraping: any malformed row is dropped.
                continue
        return proxylist

    def RegularPraser(self,response,parser):
        '''
        Parse a page using a regular expression.

        :param response: text of the page
        :param parser: dict with ``pattern`` (regex with groups) and
            ``position`` (``ip`` / ``port`` indexes into each match)
        :return: list of proxy dicts; ``port`` is left as the matched string
            because CnproxyPraser decodes it afterwards
        '''
        proxylist=[]
        for match in re.compile(parser['pattern']).findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                proxylist.append(self._build_proxy(ip,port))
            except Exception:
                # Best-effort scraping: any malformed match is dropped.
                continue
        return proxylist

    def CnproxyPraser(self,response,parser):
        '''
        Parse a page whose ports are written as '+'-separated letters.

        Runs RegularPraser, then maps every letter of each port through the
        digit table (ignoring '+') and stores the result as an int.

        :param response: text of the page
        :param parser: same description RegularPraser expects
        :return: list of proxy dicts with integer ports; entries whose port
            cannot be decoded are dropped
        '''
        digits = self._CNPROXY_DIGITS
        proxylist = []
        for proxy in self.RegularPraser(response,parser):
            try:
                decoded = ''.join(digits[ch] for ch in proxy['port'] if ch != '+')
                proxy['port'] = int(decoded)
            except (KeyError, ValueError):
                # Unknown letter or empty port: skip this entry only.
                continue
            proxylist.append(proxy)
        return proxylist

    def proxy_listPraser(self,response,parser):
        '''
        Parse a page that embeds proxies as ``Proxy('<base64 of ip:port>')``.

        :param response: text of the page
        :param parser: dict with ``pattern``, a regex whose matches are the
            ``Proxy('...')`` snippets
        :return: list of proxy dicts with integer ports (empty when nothing
            matches)
        '''
        proxylist=[]
        for match in re.compile(parser['pattern']).findall(response):
            try:
                encoded = match.replace("Proxy('","").replace("')","")
                ip_port = base64.b64decode(encoded)
                # On Python 3 b64decode returns bytes; splitting bytes on a
                # str separator raises TypeError, which used to discard every
                # entry. Python 2 str values pass through unchanged.
                if not isinstance(ip_port, str):
                    ip_port = ip_port.decode('utf-8')
                fields = ip_port.split(':')
                ip = fields[0]
                port = int(fields[1])
                proxylist.append(self._build_proxy(ip,port))
            except Exception:
                # Best-effort scraping: any malformed entry is dropped.
                continue
        return proxylist