1
0

HtmlPraser.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. #coding:utf-8
  2. import base64
  3. from config import QQWRY_PATH, CHINA_AREA
  4. from util.IPAddress import IPAddresss
  5. import re
  6. __author__ = 'qiye'
  7. from lxml import etree
  8. class Html_Parser(object):
  9. def __init__(self):
  10. self.ips = IPAddresss(QQWRY_PATH)
  11. def parse(self,response,parser):
  12. '''
  13. :param response: 响应
  14. :param type: 解析方式
  15. :return:
  16. '''
  17. if parser['type']=='xpath':
  18. return self.XpathPraser(response,parser)
  19. elif parser['type']=='regular':
  20. return self.RegularPraser(response,parser)
  21. elif parser['type']=='module':
  22. return getattr(self,parser['moduleName'],None)(response,parser)
  23. else:
  24. return None
  25. def AuthCountry(self,addr):
  26. '''
  27. 用来判断地址是哪个国家的
  28. :param addr:
  29. :return:
  30. '''
  31. for area in CHINA_AREA:
  32. if addr.find(area)!=-1:
  33. return True
  34. return False
  35. def XpathPraser(self,response,parser):
  36. '''
  37. 针对xpath方式进行解析
  38. :param response:
  39. :param parser:
  40. :return:
  41. '''
  42. # print response
  43. proxylist=[]
  44. root = etree.HTML(response)
  45. proxys = root.xpath(parser['pattern'])
  46. # print proxys
  47. for proxy in proxys:
  48. # print parser['postion']['ip']
  49. try:
  50. ip = proxy.xpath(parser['position']['ip'])[0].text
  51. port = proxy.xpath(parser['position']['port'])[0].text
  52. # print(ip,port)
  53. # print proxys
  54. # type = proxy.xpath(parser['postion']['type'])[0].text
  55. # # print ip,port,type
  56. # if type.find(u'高匿')!=-1:
  57. # type = 0
  58. # else:
  59. # type = 1
  60. # protocol=''
  61. # if len(parser['postion']['protocol']) > 0:
  62. # protocol = proxy.xpath(parser['postion']['protocol'])[0].text
  63. # if protocol.lower().find('https')!=-1:
  64. # protocol = 1
  65. # else:
  66. # protocol = 0
  67. # else:
  68. type=0
  69. protocol = 0
  70. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  71. country = ''
  72. area = ''
  73. if addr.find('省')!=-1 or self.AuthCountry(addr):
  74. country = '中国'
  75. area = addr
  76. else:
  77. country = addr
  78. area = ''
  79. except Exception as e:
  80. continue
  81. # updatetime = datetime.datetime.now()
  82. # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间)
  83. # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
  84. proxy ={'ip':ip,'port':int(port),'types':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
  85. proxylist.append(proxy)
  86. return proxylist
  87. def RegularPraser(self,response,parser):
  88. '''
  89. 针对正则表达式进行解析
  90. :param response:
  91. :param parser:
  92. :return:
  93. '''
  94. proxylist=[]
  95. pattern = re.compile(parser['pattern'])
  96. matchs = pattern.findall(response)
  97. if matchs !=None:
  98. for match in matchs:
  99. try:
  100. ip = match[parser['position']['ip']]
  101. port = match[parser['position']['port']]
  102. #网站的类型一直不靠谱所以还是默认,之后会检测
  103. type =0
  104. # if parser['postion']['protocol'] > 0:
  105. # protocol = match[parser['postion']['protocol']]
  106. # if protocol.lower().find('https')!=-1:
  107. # protocol = 1
  108. # else:
  109. # protocol = 0
  110. # else:
  111. protocol = 0
  112. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  113. country = ''
  114. area = ''
  115. if addr.find('省')!=-1 or self.AuthCountry(addr):
  116. country = '中国'
  117. area = addr
  118. else:
  119. country = addr
  120. area = ''
  121. except Exception as e:
  122. continue
  123. proxy ={'ip':ip,'port':port,'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
  124. proxylist.append(proxy)
  125. return proxylist
  126. def CnproxyPraser(self,response,parser):
  127. proxylist = self.RegularPraser(response,parser)
  128. chardict ={'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'}
  129. for proxy in proxylist:
  130. port = proxy['port']
  131. new_port = ''
  132. for i in range(len(port)):
  133. if port[i]!='+':
  134. new_port += chardict[port[i]]
  135. new_port = int(new_port)
  136. proxy['port'] =new_port
  137. return proxylist
  138. def proxy_listPraser(self,response,parser):
  139. proxylist=[]
  140. pattern = re.compile(parser['pattern'])
  141. matchs = pattern.findall(response)
  142. if matchs:
  143. for match in matchs:
  144. try:
  145. ip_port = base64.b64decode(match.replace("Proxy('","").replace("')",""))
  146. ip = ip_port.split(':')[0]
  147. port = ip_port.split(':')[1]
  148. type =0
  149. protocol = 0
  150. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  151. country = ''
  152. area = ''
  153. if addr.find('省')!=-1 or self.AuthCountry(addr):
  154. country = '中国'
  155. area = addr
  156. else:
  157. country = addr
  158. area = ''
  159. except Exception as e:
  160. continue
  161. proxy ={'ip':ip,'port':int(port),'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
  162. proxylist.append(proxy)
  163. return proxylist