// crawler.js
  const http = require('http')
  const iconv = require('iconv-lite')
  const BufferHelper = require('bufferhelper')
  /*
   * Naming abbreviations used in this file
   *
   * Province level (Province)                              p
   * Prefecture level (City)                                c
   * County level (Area / district)                         a
   * Township level (Street / town)                         s
   * Village level (Village / residents' committee)         v
   */

  // Province cell on the index page.
  // Group 1: link target without the ".html" suffix; group 2: link text.
  const pReg = /<td><a href='(.*?)\.html'>(.*?)<br\/><\/a><\/td>/g

  // Table row whose two cells are both links (used for the c, a and s levels).
  // Group 1: text of the first link; group 2: text of the second link.
  const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g

  // Three consecutive plain cells (used for the v level).
  // Group 1: first cell; group 2: third cell. The middle cell is skipped.
  const vReg = /<td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td>/g

  // Host that serves the listing pages.
  const host = 'www.stats.gov.cn'

  // Path template; the literal `#{route}` placeholder is replaced per request.
  const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{route}.html'
  /**
   * Fetch one listing page and extract its code -> name pairs.
   *
   * Issues an HTTP GET to `host` for the module-level `path` template with
   * `#{route}` replaced by `route`, decodes the response body as GBK and runs
   * `regexp` over the text. For every match, capture group 1 cut to its first
   * `codeLen` characters becomes a key and the trimmed capture group 2 becomes
   * the value.
   *
   * @param {string} host - Host name to request.
   * @param {string} route - Replacement for `#{route}` in the path template.
   * @param {RegExp} regexp - Pattern with two capture groups (code, name).
   * @param {number} codeLen - Number of leading characters of the code to keep.
   * @returns {Promise<Object<string, string>>} Resolves with the code -> name
   *   map. Rejects on a non-200 status, on a request or response error, on a
   *   failure while decoding/parsing, or with `Error('timeout')` when the
   *   3000 ms socket timeout fires.
   * @author modood <https://github.com/modood>
   * @datetime 2018-01-31 19:23
   */
  exports.fetch = (host, route, regexp, codeLen) =>
    new Promise((resolve, reject) => {
      const req = http.get({
        host,
        path: path.replace('#{route}', route),
        timeout: 3000
      }, res => {
        const statusCode = res.statusCode
        if (statusCode !== 200) {
          res.resume() // consume the body so it does not sit in memory
          return reject(new Error(`Request Failed. Status Code: ${statusCode}`))
        }

        const bufferHelper = new BufferHelper()
        res.on('data', chunk => bufferHelper.concat(chunk))
        // A response that breaks mid-stream must settle the promise too.
        res.on('error', reject)
        res.on('end', () => {
          // Anything thrown in here would otherwise be an uncaught exception
          // instead of a rejection the caller can handle.
          try {
            const rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')
            // Work on a private global copy: exec() on the caller's /g regex is
            // stateful through lastIndex, and a non-global regex would never
            // advance, making the loop below spin forever.
            const flags = regexp.flags.includes('g') ? regexp.flags : regexp.flags + 'g'
            const matcher = new RegExp(regexp.source, flags)
            const result = {}
            let current
            while ((current = matcher.exec(rawData)) !== null) {
              result[current[1].substring(0, codeLen)] = current[2].trim()
            }
            resolve(result)
          } catch (err) {
            reject(err)
          }
        })
      })

      req.on('error', reject)
      req.on('timeout', () => {
        reject(new Error('timeout'))
        // The 'timeout' event only reports an idle socket; the request has to
        // be torn down explicitly or the connection stays open.
        req.destroy()
      })
    })
  44. /**
  45. * 抓取省级数据
  46. * @author modood <https://github.com/modood>
  47. * @datetime 2018-01-31 19:40
  48. */
  49. exports.fetchProvinces = async () => {
  50. try {
  51. return await exports.fetch(host, 'index', pReg, 2)
  52. } catch (err) {
  53. if (err.message !== 'timeout') console.log(`抓取省级数据失败(${err}),正在重试...`)
  54. return exports.fetchProvinces()
  55. }
  56. }
  57. /**
  58. * 抓取地级数据
  59. * @author modood <https://github.com/modood>
  60. * @datetime 2018-01-31 19:51
  61. */
  62. exports.fetchCities = async (pCode) => {
  63. try {
  64. return await exports.fetch(host, pCode, casReg, 4)
  65. } catch (err) {
  66. if (err.message !== 'timeout') console.log(`抓取省级(${pCode})的地级数据失败(${err}),正在重试...`)
  67. return exports.fetchCities(pCode)
  68. }
  69. }
  70. /**
  71. * 抓取县级数据
  72. * @author modood <https://github.com/modood>
  73. * @datetime 2018-01-31 20:03
  74. */
  75. exports.fetchAreas = async (cCode) => {
  76. cCode = cCode.toString()
  77. const pCode = cCode.substr(0, 2)
  78. try {
  79. return await exports.fetch(host, `${pCode}/${cCode}`, casReg, 6)
  80. } catch (err) {
  81. if (err.message !== 'timeout') console.log(`抓取地级(${cCode})的县级数据失败(${err}),正在重试...`)
  82. return exports.fetchAreas(cCode)
  83. }
  84. }
  85. /**
  86. * 抓取乡级数据
  87. * @author modood <https://github.com/modood>
  88. * @datetime 2018-01-31 20:08
  89. */
  90. exports.fetchStreets = async (aCode, route) => {
  91. aCode = aCode.toString()
  92. const pCode = aCode.substr(0, 2)
  93. const cCodeSuffix = aCode.substr(2, 2)
  94. const _route = route || `${pCode}/${cCodeSuffix}/${aCode}`
  95. try {
  96. return await exports.fetch(host, _route, casReg, 9)
  97. } catch (err) {
  98. if (err.message !== 'timeout') console.log(`抓取县级(${aCode})的乡级数据失败(${err}),正在重试...`)
  99. return exports.fetchStreets(aCode, route)
  100. }
  101. }
  102. /**
  103. * 抓取村级数据
  104. * @author modood <https://github.com/modood>
  105. * @datetime 2018-01-31 20:19
  106. */
  107. exports.fetchVillages = async (sCode, route) => {
  108. sCode = sCode.toString()
  109. const pCode = sCode.substr(0, 2)
  110. const cCodeSuffix = sCode.substr(2, 2)
  111. const aCodeSuffix = sCode.substr(4, 2)
  112. const _route = route || `${pCode}/${cCodeSuffix}/${aCodeSuffix}/${sCode}`
  113. try {
  114. return await exports.fetch(host, _route, vReg, 12)
  115. } catch (err) {
  116. if (err.message !== 'timeout') console.log(`抓取乡级(${sCode})的村级数据失败(${err}),正在重试...`)
  117. return exports.fetchVillages(sCode, route)
  118. }
  119. }