HtmlPraser.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #coding:utf-8
  2. import base64
  3. import datetime
  4. from config import QQWRY_PATH, CHINA_AREA
  5. from util.IPAddress import IPAddresss
  6. import re
  7. __author__ = 'qiye'
  8. from lxml import etree
  9. class Html_Parser(object):
  10. def __init__(self):
  11. self.ips = IPAddresss(QQWRY_PATH)
  12. def parse(self,response,parser):
  13. '''
  14. :param response: 响应
  15. :param type: 解析方式
  16. :return:
  17. '''
  18. if parser['type']=='xpath':
  19. return self.XpathPraser(response,parser)
  20. elif parser['type']=='regular':
  21. return self.RegularPraser(response,parser)
  22. elif parser['type']=='module':
  23. return getattr(self,parser['moduleName'],None)(response,parser)
  24. else:
  25. return None
  26. def AuthCountry(self,addr):
  27. '''
  28. 用来判断地址是哪个国家的
  29. :param addr:
  30. :return:
  31. '''
  32. for area in CHINA_AREA:
  33. if addr.find(area)!=-1:
  34. return True
  35. return False
  36. def XpathPraser(self,response,parser):
  37. '''
  38. 针对xpath方式进行解析
  39. :param response:
  40. :param parser:
  41. :return:
  42. '''
  43. # print response
  44. proxylist=[]
  45. root = etree.HTML(response)
  46. proxys = root.xpath(parser['pattern'])
  47. # print proxys
  48. for proxy in proxys:
  49. # print parser['postion']['ip']
  50. try:
  51. ip = proxy.xpath(parser['position']['ip'])[0].text
  52. port = proxy.xpath(parser['position']['port'])[0].text
  53. # type = proxy.xpath(parser['postion']['type'])[0].text
  54. # # print ip,port,type
  55. # if type.find(u'高匿')!=-1:
  56. # type = 0
  57. # else:
  58. # type = 1
  59. # protocol=''
  60. # if len(parser['postion']['protocol']) > 0:
  61. # protocol = proxy.xpath(parser['postion']['protocol'])[0].text
  62. # if protocol.lower().find('https')!=-1:
  63. # protocol = 1
  64. # else:
  65. # protocol = 0
  66. # else:
  67. type=0
  68. protocol = 0
  69. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  70. country = ''
  71. area = ''
  72. if addr.find(u'省')!=-1 or self.AuthCountry(addr):
  73. country = u'中国'
  74. area = addr
  75. else:
  76. country = addr
  77. area = ''
  78. except Exception,e:
  79. continue
  80. # updatetime = datetime.datetime.now()
  81. # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间)
  82. # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
  83. proxy ={'ip':ip,'port':int(port),'types':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
  84. proxylist.append(proxy)
  85. return proxylist
  86. def RegularPraser(self,response,parser):
  87. '''
  88. 针对正则表达式进行解析
  89. :param response:
  90. :param parser:
  91. :return:
  92. '''
  93. proxylist=[]
  94. pattern = re.compile(parser['pattern'])
  95. matchs = pattern.findall(response)
  96. if matchs !=None:
  97. for match in matchs:
  98. ip = match[parser['position']['ip']]
  99. port = match[parser['position']['port']]
  100. #网站的类型一直不靠谱所以还是默认,之后会检测
  101. type =0
  102. # if parser['postion']['protocol'] > 0:
  103. # protocol = match[parser['postion']['protocol']]
  104. # if protocol.lower().find('https')!=-1:
  105. # protocol = 1
  106. # else:
  107. # protocol = 0
  108. # else:
  109. protocol = 0
  110. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  111. country = ''
  112. area = ''
  113. if addr.find(u'省')!=-1 or self.AuthCountry(addr):
  114. country = u'中国'
  115. area = addr
  116. else:
  117. country = addr
  118. area = ''
  119. proxy ={'ip':ip,'port':port,'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
  120. proxylist.append(proxy)
  121. return proxylist
  122. def CnproxyPraser(self,response,parser):
  123. proxylist = self.RegularPraser(response,parser)
  124. chardict ={'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'}
  125. for proxy in proxylist:
  126. port = proxy['port']
  127. new_port = ''
  128. for i in range(len(port)):
  129. if port[i]!='+':
  130. new_port += chardict[port[i]]
  131. new_port = int(new_port)
  132. proxy['port'] =new_port
  133. return proxylist
  134. def proxy_listPraser(self,response,parser):
  135. proxylist=[]
  136. pattern = re.compile(parser['pattern'])
  137. matchs = pattern.findall(response)
  138. if matchs:
  139. for match in matchs:
  140. ip_port = base64.b64decode(match.replace("Proxy('","").replace("')",""))
  141. ip = ip_port.split(':')[0]
  142. port = ip_port.split(':')[1]
  143. type =0
  144. protocol = 0
  145. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  146. country = ''
  147. area = ''
  148. if addr.find(u'省')!=-1 or self.AuthCountry(addr):
  149. country = u'中国'
  150. area = addr
  151. else:
  152. country = addr
  153. area = ''
  154. proxy ={'ip':ip,'port':int(port),'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
  155. proxylist.append(proxy)
  156. return proxylist