- #!/usr/bin/env python3
- # -*- encoding: utf-8 -*-
- from __future__ import print_function
- import sys
- from os import walk
- import os
- import json
- import codecs
- from bs4 import BeautifulSoup
- from config import ignored_title
- from config import domain_mapping
- from config import mirror_site_folder
# The mirror folder (e.g. /data/websites) holds one directory per mirrored
# blog site: csrd.aliapp.com, www.taobaotest.com, www.tbdata.org
def eprint(*args, **kwargs):
    """Print to stderr, mirroring the built-in print() signature."""
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def get_immediate_subdirectories(a_dir):
    """Return (full_path, name) pairs for each direct subdirectory of a_dir.

    Order follows os.listdir(); plain files are skipped.
    """
    subdirs = []
    for entry in os.listdir(a_dir):
        full_path = os.path.join(a_dir, entry)
        if os.path.isdir(full_path):
            subdirs.append((full_path, entry))
    return subdirs
# Candidate encodings tried in order when reading mirrored HTML files;
# the first one that decodes without error wins (see links()).
# NOTE: the original last entry '8859-1' is not a recognized codec name and
# raised LookupError, so the intended Latin-1 fallback never ran; 'latin-1'
# is the valid alias and can decode any byte sequence.
encodings = ['UTF-8', 'GBK', 'latin-1', ]
def links(dirpath, filename, domain_url):
    """Yield (absolute_url, title) pairs for usable <a> tags in one HTML file.

    The file is decoded by trying each entry of ``encodings`` in order; the
    first encoding that reads and parses without raising wins.  An anchor is
    skipped when it has no text, its text is listed in ``ignored_title``, or
    its href contains a '#' fragment marker.  Hrefs already starting with
    ``domain_url`` are kept as-is; root-relative hrefs (starting with '/')
    are prefixed with ``domain_url``; anything else is ignored.
    """
    path = os.path.join(dirpath, filename)
    for encoding in encodings:
        try:
            with codecs.open(path, 'r', encoding=encoding) as handle:
                soup = BeautifulSoup(handle.read(), "html.parser")
                for tag in soup.find_all('a', href=True):
                    title = tag.string
                    href = tag['href']
                    if not title or title in ignored_title or '#' in href:
                        continue
                    if href.startswith(domain_url):
                        link = href
                    elif href.startswith('/'):
                        link = domain_url + href
                    else:
                        continue
                    # Strip surrounding whitespace/newlines and drop double
                    # quotes so the title is safe inside the generated dict
                    # literal printed by __main__.
                    yield (link, title.strip('\r\n\t ').replace('"', ""))
        except Exception:
            eprint("# error processing file: {} for encoding: {}".format(path, encoding))
        else:
            break  # decoded successfully; do not retry remaining encodings
def generate_links_title(directory, domain_url):
    """Recursively scan *directory* and map each extracted link to its title.

    Only files whose names end in "html" or "htm" are parsed; later
    occurrences of the same link overwrite earlier titles.
    """
    collected = {}
    for dirpath, _dirnames, filenames in walk(directory):
        html_names = (f for f in filenames if f.endswith(("html", "htm")))
        for name in html_names:
            collected.update(links(dirpath, name, domain_url))
    return collected
if __name__ == "__main__":
    # Emit a Python-literal dict mapping every mirrored URL to its page title.
    print("urls = {")
    for directory, domain_name in get_immediate_subdirectories(mirror_site_folder):
        domain_url = domain_mapping[domain_name]
        # Renamed from `dict` to avoid shadowing the builtin.
        links_title = generate_links_title(directory, domain_url)
        for link, title in links_title.items():
            print('"{}" : "{}",'.format(link, title))
    print("}")
    # Alternative output: pretty-print the mapping as JSON instead.
    #print("urls = ", end='', flush=True)
    #print(json.dumps(full_links_title_dict, indent=4, sort_keys=True, ensure_ascii=False))
|