HtmlPraser.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # coding:utf-8
  2. import base64
  3. from config import QQWRY_PATH, CHINA_AREA
  4. from util.IPAddress import IPAddresss
  5. import re
  6. from util.compatibility import text_
  7. __author__ = 'qiye'
  8. from lxml import etree
  9. class Html_Parser(object):
  def __init__(self):
      '''
      Create the parser together with its IP lookup helper.

      ``self.ips`` is an ``IPAddresss`` built from ``QQWRY_PATH``. The parsing
      methods of this class call its ``str2ip`` and ``getIpAddr`` methods to
      obtain the address text for a proxy IP.
      '''
      ip_locator = IPAddresss(QQWRY_PATH)
      self.ips = ip_locator
  12. def parse(self, response, parser):
  13. '''
  14. :param response: 响应
  15. :param type: 解析方式
  16. :return:
  17. '''
  18. if parser['type'] == 'xpath':
  19. return self.XpathPraser(response, parser)
  20. elif parser['type'] == 'regular':
  21. return self.RegularPraser(response, parser)
  22. elif parser['type'] == 'module':
  23. return getattr(self, parser['moduleName'], None)(response, parser)
  24. else:
  25. return None
  def AuthCountry(self, addr):
      '''
      Tell whether an address mentions one of the configured areas of China.

      :param addr: address text to inspect
      :return: ``True`` if at least one entry of ``CHINA_AREA`` (passed
          through ``text_``) occurs in ``addr``, otherwise ``False``
      '''
      return any(text_(region) in addr for region in CHINA_AREA)
  def XpathPraser(self, response, parser):
      '''
      Extract proxies from an HTML page using XPath expressions.

      :param response: HTML content of the page
      :param parser: dict with ``'pattern'`` (XPath selecting one node per
          proxy) and ``'position'`` (dict holding, under ``'ip'`` and
          ``'port'``, the XPath evaluated relative to each node)
      :return: list of proxy dicts with the keys ``ip``, ``port`` (int),
          ``types`` (0 = high anonymity, 1 = transparent), ``protocol``
          (0 = http, 1 = https/http), ``country``, ``area`` and ``speed``.
          ``types`` and ``protocol`` are always 0 here.
      '''
      proxylist = []
      root = etree.HTML(response)
      if root is None:
          # etree.HTML may yield None when the input holds no parseable markup.
          return proxylist
      for node in root.xpath(parser['pattern']):
          try:
              ip = node.xpath(parser['position']['ip'])[0].text
              # Convert inside the try so that a missing or non-numeric port
              # only skips this row instead of aborting the whole page.
              port = int(node.xpath(parser['position']['port'])[0].text)
              addr = self.ips.getIpAddr(self.ips.str2ip(ip))
              if text_('省') in addr or self.AuthCountry(addr):
                  country = text_('国内')
              else:
                  country = text_('国外')
          except Exception:
              # Best effort: rows that cannot be parsed or located are dropped.
              continue
          proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                            'country': country, 'area': addr, 'speed': 100})
      return proxylist
  def RegularPraser(self, response, parser):
      '''
      Extract proxies from a page using a regular expression.

      :param response: page text to search
      :param parser: dict with ``'pattern'`` (the regex source) and
          ``'position'`` (dict giving, under ``'ip'`` and ``'port'``, the
          index of that field inside each match)
      :return: list of proxy dicts with the keys ``ip``, ``port``, ``types``,
          ``protocol``, ``country``, ``area`` and ``speed``. ``port`` is kept
          exactly as matched (not converted to int); ``CnproxyPraser`` relies
          on that to decode obfuscated ports afterwards.
      '''
      proxylist = []
      # findall always returns a list, so no None check is needed.
      for match in re.compile(parser['pattern']).findall(response):
          try:
              ip = match[parser['position']['ip']]
              port = match[parser['position']['port']]
              addr = self.ips.getIpAddr(self.ips.str2ip(ip))
              if text_('省') in addr or self.AuthCountry(addr):
                  country = text_('国内')
              else:
                  country = text_('国外')
          except Exception:
              # Best effort: matches that cannot be read or located are dropped.
              continue
          # The type reported by the website is never reliable, so the default
          # (0) is kept for both type and protocol; it will be detected later.
          proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                            'country': country, 'area': addr, 'speed': 100})
      return proxylist
  111. def CnproxyPraser(self, response, parser):
  112. proxylist = self.RegularPraser(response, parser)
  113. chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
  114. for proxy in proxylist:
  115. port = proxy['port']
  116. new_port = ''
  117. for i in range(len(port)):
  118. if port[i] != '+':
  119. new_port += chardict[port[i]]
  120. new_port = int(new_port)
  121. proxy['port'] = new_port
  122. return proxylist
  def proxy_listPraser(self, response, parser):
      '''
      Extract proxies that the page embeds as base64 inside ``Proxy('...')``.

      Each regex match has the ``Proxy('`` and ``')`` wrapper removed, is
      base64-decoded and is then split at ``':'`` into IP and port.

      :param response: page text to search
      :param parser: dict whose ``'pattern'`` is the regex source to match
      :return: list of proxy dicts with the keys ``ip``, ``port`` (int),
          ``types``, ``protocol``, ``country``, ``area`` and ``speed``
      '''
      proxylist = []
      for match in re.compile(parser['pattern']).findall(response):
          try:
              encoded = match.replace("Proxy('", "").replace("')", "")
              ip_port = base64.b64decode(encoded)
              if not isinstance(ip_port, str):
                  # On Python 3 b64decode returns bytes; splitting bytes with
                  # a str separator raises TypeError, which dropped every row.
                  ip_port = ip_port.decode('utf-8')
              fields = ip_port.split(':')
              ip = fields[0]
              # Convert inside the try so a malformed port only skips this row.
              port = int(fields[1])
              addr = self.ips.getIpAddr(self.ips.str2ip(ip))
              if text_('省') in addr or self.AuthCountry(addr):
                  country = text_('国内')
              else:
                  country = text_('国外')
          except Exception:
              # Best effort: entries that cannot be decoded or located are dropped.
              continue
          proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                            'country': country, 'area': addr, 'speed': 100})
      return proxylist