links_title_dumper.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from __future__ import print_function
import sys
from os import walk
import os
import json  # only needed by the commented-out JSON pretty-printer at the bottom
import codecs
from bs4 import BeautifulSoup
from config import ignored_title
from config import domain_mapping
from config import mirror_site_folder

# /data/websites holds one directory per mirrored blog site, e.g.:
# csrd.aliapp.com, www.taobaotest.com, www.tbdata.org
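
# config.py itself is not shown; below is a minimal sketch of what it is
# assumed to provide, inferred from how the three names are used in this
# script (the values are illustrative placeholders, not the real config):
#
#   ignored_title = set(['Read More', 'Permalink'])   # anchor texts to skip
#   domain_mapping = {                                # folder name -> base URL
#       'www.tbdata.org': 'http://www.tbdata.org',
#   }
#   mirror_site_folder = '/data/websites'             # root of the mirrored sites
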
def eprint(*args, **kwargs):
    # Print diagnostics to stderr so they do not pollute the generated dict.
    print(*args, file=sys.stderr, **kwargs)

def get_immediate_subdirectories(a_dir):
    # Return (full_path, name) pairs for each direct subdirectory of a_dir.
    return [(os.path.join(a_dir, name), name) for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]

# Candidate file encodings, tried in order. ISO-8859-1 can decode any byte
# sequence, so it acts as the last-resort fallback. ('8859-1' is not an
# encoding name Python recognizes and would raise LookupError on every file.)
encodings = ['UTF-8', 'GBK', 'ISO-8859-1']

def links(dirpath, filename, domain_url):
    """Yield (absolute_link, title) for every usable <a> tag in one HTML file."""
    path = os.path.join(dirpath, filename)
    for encoding in encodings:
        try:
            with codecs.open(path, 'r', encoding=encoding) as fh:
                soup = BeautifulSoup(fh.read(), "html.parser")
                for tag in soup.find_all('a', href=True):
                    href = tag.get('href')
                    # Skip anchors without text, blacklisted titles, and fragment links.
                    if not tag.string or tag.string in ignored_title or '#' in href:
                        continue
                    # Keep only on-site links: absolute URLs under domain_url,
                    # or site-relative URLs starting with '/'.
                    if href.startswith(domain_url) or href.startswith('/'):
                        link = href if href.startswith(domain_url) else (domain_url + href)
                        # Trim whitespace/newlines from the title and drop double
                        # quotes so the printed dict literal stays valid.
                        yield (link, tag.string.strip('\r\n\t ').replace('"', ""))
        except Exception:
            eprint("# error processing file: {} for encoding: {}".format(path, encoding))
        else:
            break  # this encoding worked; skip the remaining candidates
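
# Example (hypothetical input): with domain_url 'http://www.tbdata.org' and a
# page containing <a href="/archives/1024">Hadoop notes</a>, links() yields
# ('http://www.tbdata.org/archives/1024', 'Hadoop notes').
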
def generate_links_title(directory, domain_url):
    # Walk one mirrored site and collect {absolute_link: title} from every
    # .html/.htm file underneath it.
    links_title_dict = {}
    for (dirpath, dirnames, filenames) in walk(directory):
        for filename in filenames:
            #print("{} {} {}".format(dirpath, dirnames, filename))
            if filename.endswith("html") or filename.endswith("htm"):
                for (link, title) in links(dirpath, filename, domain_url):
                    links_title_dict[link] = title
    return links_title_dict

if __name__ == "__main__":
    # Dump all mirrored sites as one big Python dict literal: {url: title}.
    print("urls = {")
    for (directory, domain_name) in get_immediate_subdirectories(mirror_site_folder):
        domain_url = domain_mapping[domain_name]
        links_title = generate_links_title(directory, domain_url)
        for (key, val) in links_title.items():
            print('"{}" : "{}",'.format(key, val))
    print("}")

    # Alternative: merge the per-site dicts into one full_links_title_dict
    # and pretty-print it as JSON instead:
    #print("urls = ", end='', flush=True)
    #print(json.dumps(full_links_title_dict, indent=4, sort_keys=True, ensure_ascii=False))
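
# Typical invocation (a sketch, assuming config.py is importable from the
# working directory): redirect stdout to a module file, e.g.
#
#   $ python links_title_dumper.py > urls.py    # errors go to stderr
#
# which produces output shaped like:
#
#   urls = {
#   "http://www.tbdata.org/archives/1024" : "Hadoop notes",
#   }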