HtmlPraser.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. # coding:utf-8
  2. import base64
  3. import datetime
  4. from config import QQWRY_PATH, CHINA_AREA
  5. from util.IPAddress import IPAddresss
  6. import re
  7. __author__ = 'qiye'
  8. from lxml import etree
  9. class Html_Parser(object):
  10. def __init__(self):
  11. self.ips = IPAddresss(QQWRY_PATH)
  12. def parse(self, response, parser):
  13. '''
  14. :param response: 响应
  15. :param type: 解析方式
  16. :return:
  17. '''
  18. if parser['type'] == 'xpath':
  19. return self.XpathPraser(response, parser)
  20. elif parser['type'] == 'regular':
  21. return self.RegularPraser(response, parser)
  22. elif parser['type'] == 'module':
  23. return getattr(self, parser['moduleName'], None)(response, parser)
  24. else:
  25. return None
  26. def AuthCountry(self, addr):
  27. '''
  28. 用来判断地址是哪个国家的
  29. :param addr:
  30. :return:
  31. '''
  32. for area in CHINA_AREA:
  33. if addr.find(area) != -1:
  34. return True
  35. return False
  36. def XpathPraser(self, response, parser):
  37. '''
  38. 针对xpath方式进行解析
  39. :param response:
  40. :param parser:
  41. :return:
  42. '''
  43. # print response
  44. proxylist = []
  45. root = etree.HTML(response)
  46. proxys = root.xpath(parser['pattern'])
  47. for proxy in proxys:
  48. try:
  49. ip = proxy.xpath(parser['position']['ip'])[0].text
  50. port = proxy.xpath(parser['position']['port'])[0].text
  51. type = 0
  52. protocol = 0
  53. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  54. country = ''
  55. area = ''
  56. if addr.find(u'省') != -1 or self.AuthCountry(addr):
  57. country = u'国内'
  58. area = addr
  59. else:
  60. country = u'国外'
  61. area = addr
  62. except Exception, e:
  63. continue
  64. # updatetime = datetime.datetime.now()
  65. # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间)
  66. # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
  67. proxy = {'ip': ip, 'port': int(port), 'types': int(type), 'protocol': int(protocol), 'country': country,
  68. 'area': area, 'speed': 100}
  69. proxylist.append(proxy)
  70. return proxylist
  71. def RegularPraser(self, response, parser):
  72. '''
  73. 针对正则表达式进行解析
  74. :param response:
  75. :param parser:
  76. :return:
  77. '''
  78. proxylist = []
  79. pattern = re.compile(parser['pattern'])
  80. matchs = pattern.findall(response)
  81. if matchs != None:
  82. for match in matchs:
  83. ip = match[parser['position']['ip']]
  84. port = match[parser['position']['port']]
  85. # 网站的类型一直不靠谱所以还是默认,之后会检测
  86. type = 0
  87. # if parser['postion']['protocol'] > 0:
  88. # protocol = match[parser['postion']['protocol']]
  89. # if protocol.lower().find('https')!=-1:
  90. # protocol = 1
  91. # else:
  92. # protocol = 0
  93. # else:
  94. protocol = 0
  95. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  96. country = ''
  97. area = ''
  98. if addr.find(u'省') != -1 or self.AuthCountry(addr):
  99. country = u'中国'
  100. area = addr
  101. else:
  102. country = addr
  103. area = ''
  104. proxy = {'ip': ip, 'port': port, 'types': type, 'protocol': protocol, 'country': country, 'area': area,
  105. 'speed': 100}
  106. proxylist.append(proxy)
  107. return proxylist
  108. def CnproxyPraser(self, response, parser):
  109. proxylist = self.RegularPraser(response, parser)
  110. chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
  111. for proxy in proxylist:
  112. port = proxy['port']
  113. new_port = ''
  114. for i in range(len(port)):
  115. if port[i] != '+':
  116. new_port += chardict[port[i]]
  117. new_port = int(new_port)
  118. proxy['port'] = new_port
  119. return proxylist
  120. def proxy_listPraser(self, response, parser):
  121. proxylist = []
  122. pattern = re.compile(parser['pattern'])
  123. matchs = pattern.findall(response)
  124. if matchs:
  125. for match in matchs:
  126. ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", ""))
  127. ip = ip_port.split(':')[0]
  128. port = ip_port.split(':')[1]
  129. type = 0
  130. protocol = 0
  131. addr = self.ips.getIpAddr(self.ips.str2ip(ip))
  132. country = ''
  133. area = ''
  134. if addr.find(u'省') != -1 or self.AuthCountry(addr):
  135. country = u'中国'
  136. area = addr
  137. else:
  138. country = addr
  139. area = ''
  140. proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country,
  141. 'area': area, 'speed': 100}
  142. proxylist.append(proxy)
  143. return proxylist