#!/usr/bin/env python3 # -*- encoding: utf-8 -*- from __future__ import print_function import sys from os import walk import os import json import codecs from bs4 import BeautifulSoup from config import ignored_title from config import domain_mapping from config import mirror_site_folder # /data/websites 目录下面有各个博客站点的目录: # csrd.aliapp.com, www.taobaotest.com, www.tbdata.org def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) def get_immediate_subdirectories(a_dir): return [(os.path.join(a_dir, name),name) for name in os.listdir(a_dir) if os.path.isdir(os.path.join(a_dir, name))] encodings = ['UTF-8', 'GBK', '8859-1', ] def links(dirpath, filename, domain_url): file = os.path.join(dirpath, filename) for encoding in encodings: try: with codecs.open(file, 'r', encoding=encoding) as input: soup = BeautifulSoup(input.read(),"html.parser") for tag in soup.find_all('a',href=True): if not tag.string or tag.string in ignored_title or '#' in tag.get('href'): continue if tag.string and (tag.get('href').startswith(domain_url) or tag.get('href').startswith('/')): link = tag.get('href') if tag.get('href').startswith(domain_url) else (domain_url + tag.get('href')) ## remove multiple line in title: yield (link, tag.string.strip('\r\n\t ').replace('"', "")) except Exception as e: eprint("# error processing file: {} for encoding: {}".format(file, encoding)) else: break ## get the correct file encoding def generate_links_title(directory, domain_url): links_title_dict = {} for (dirpath, dirnames, filenames) in walk(directory): for filename in filenames: #print("{} {} {}".format(dirpath, dirnames, filename)) if filename.endswith("html") or filename.endswith("htm"): for (link, title) in links(dirpath, filename, domain_url): links_title_dict[link] = title return links_title_dict if __name__ == "__main__": print("urls = {") for (directory, domain_name) in get_immediate_subdirectories(mirror_site_folder): domain_url = domain_mapping[domain_name] dict = generate_links_title(directory, domain_url) for (key, val) in dict.items(): print('"{}" : "{}",'.format(key, val)) print("}") # pretty print json: #print("urls = ", end='', flush=True) #print(json.dumps(full_links_title_dict, indent=4, sort_keys=True, ensure_ascii=False))