Backing Up Baidu Tieba Posts

Background

In August 2020, I started growing roses to relieve stress. After falling into the hobby, I began following a blogger who wrote about rose varieties. His posts were richly illustrated, detailed, and elegantly written. Most of the varieties he recommended were strongly disease-resistant, in sharp contrast to the delicate ones heavily promoted by nursery sellers. As a result, the discussion around him was rarely calm, and open or veiled mockery appeared from time to time. In October 2020, a nursery seller who was then a moderator of one of the forums set a trap for him, intending to get those introduction posts deleted. To preserve the material, I backed up the vast majority of his posts and converted them into local files. Along the way, I found very few articles online that explain in detail how to back up Tieba posts, so I decided to write one myself for whoever needs it later.

Steps

Here is a brief overview of the work. I compiled the links of the posts to back up by hand, then used A's code to generate HTML files and B's code to batch-download the images in those posts. After that, I checked that the images and the HTML files matched one-to-one, rewrote the image and Tieba-emoticon urls in the HTML to local paths, and cleaned the noise out of the data. A fellow forum user, S, had saved some of the blogger's posts with Evernote. I cross-checked his files against mine and compiled a collection of the backed-up posts.
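All of the scripts below assume that the hand-collected thread links are kept in plain urls.txt files (one file per batch, e.g. ./backup//urls//202101//urls.txt), with one full link per line so that the 10-digit thread id (pid) can be sliced off the end of each line. For example, using the two thread ids that appear in the code comments further down:

https://tieba.baidu.com/p/6100954692
https://tieba.baidu.com/p/6233150605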

  1. Generate HTML files

This step uses hjhee's tiebaSpider code. Because of network problems, the project's dependencies may have to be downloaded manually from their official sites and unpacked into the expected directories.

  2. Download post images

This step uses zhaohui8969's tiebaImageGet code. By default, the original code downloads the images from a single link per run. I modified it slightly so that one run downloads the images from multiple links; my replacement main() is below.

import time  # for the pause between downloads; the original script may already import it


def main():
    # one thread link per line, e.g. https://tieba.baidu.com/p/6100954692
    txt_path = './backup//urls//202101//urls.txt'
    with open(txt_path, "rb") as file:
        lines = file.readlines()
        lines = [x.strip() for x in lines]

    # the thread id (pid) is the last 10 characters of each link
    pids = []
    for item in range(len(lines)):
        url = lines[item]
        pid = url[-10:]
        pids.append(int(pid))

    print(u"\nData has been processed")

    max_thread_num = 20
    save_directory = './backup//202101//img'

    try:
        # ImageGet is the downloader class defined in zhaohui8969's tiebaImageGet
        # script; this main() replaces the original one in that file.
        image_get_obj = ImageGet(max_thread_num, save_directory)
        for id in range(len(pids)):
            print(u'\nStarting download')
            image_get_obj(pids[id])
            print(u'\nSleeping for 5 seconds')
            time.sleep(5)

        print(u'\nAll images in the current url file have been downloaded. '
              u'Please switch to the next url file and change your IP address.')
    except Exception as error:
        print(u'\nSomething went wrong (%s); check the try block in main().' % error)


if __name__ == "__main__":
    main()

  3. Check file integrity

Because the code used in the previous two steps does not write any error log, I need to check that the urls, the HTML files, and the images correspond one-to-one. The code is below.

from os import listdir


def get_htmlPid(html_folders_path, html_file_name):
    # html_file_name = title + ".html"(with length of 5)
    title_len = len(html_file_name) - 5

    # 447: plain marks in html file before pid in urls
    # length of file name is not included
    begin = 447 + (2 * title_len)
    end = begin + 10
    html_file_path = html_folders_path + "//" + html_file_name

    with open(html_file_path, 'r', encoding='utf-8') as HtmlFile:
        html_source_code = HtmlFile.read()
        html_pid = int(html_source_code[begin: end])

    return html_pid


def get_imgPid(img_folders_path):

    # get all folder names
    img_pid = listdir(img_folders_path)

    img_pid_int = []
    for id in range(len(img_pid)):
        img_pid_int.append(int(img_pid[id]))

    return img_pid_int


def get_urlPid(url_path):
    url_pid = []
    with open(url_path, "r") as load_url_file:
        plain_urls = load_url_file.readlines()
        plain_urls = [x.strip() for x in plain_urls]
    for url_id in range(len(plain_urls)):
        single_url = plain_urls[url_id]
        url_pid.append(int(single_url[-10:]))
    return url_pid


def check_integrity(url_pid, html_pid, img_pid):
    # remove duplicates
    final_url_pid = list(set(url_pid))
    final_html_pid = list(set(html_pid))
    final_img_pid = list(set(img_pid))

    missing_html = []
    missing_img = []


    # for each url, check whether a matching html file and image folder exist
    for url_item in range(len(final_url_pid)):
        if final_url_pid[url_item] in final_html_pid:
            pass
        else:
            missing_html.append(final_url_pid[url_item])
        if final_url_pid[url_item] in final_img_pid:
            pass
        else:
            missing_img.append(final_url_pid[url_item])

    return missing_html, missing_img

def main():
    usr_name = "relu"
    base_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//" + usr_name
    store_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//z-missing-files"
    folders = listdir(base_path)

    # store missing_html and missing_img
    all_missing_html_pid = []
    all_missing_img_pid = []

    for folder_id in range(len(folders)):
        # initialize paths
        html_path = base_path + "//" + folders[folder_id]
        img_path = base_path + "//" + folders[folder_id] + "//img"
        url_path = base_path + "//" + folders[folder_id] + "//urls.txt"

        # collect the pids of this folder's html files
        html_pid = []

        # store html names
        html_file_names = []

        # get all html file names in a folder
        file_names = listdir(html_path)
        for name in file_names:
            if name.endswith(".html"):
                html_file_names.append(name)

        for html_name in range(len(html_file_names)):
            html_pid_single = get_htmlPid(html_path, html_file_names[html_name])
            html_pid.append(html_pid_single)
        img_pid = get_imgPid(img_path)
        url_pid = get_urlPid(url_path)

        missing_html_pid, missing_img_pid = check_integrity(url_pid, html_pid, img_pid)
        all_missing_html_pid.extend(missing_html_pid)
        all_missing_img_pid.extend(missing_img_pid)

    store_html_path = store_path + "//" + usr_name + "-missing-html.txt"
    store_img_path = store_path + "//" + usr_name + "-missing-img.txt"
    with open(store_html_path, "w", encoding="utf-8") as store_html:
        for html in range(len(all_missing_html_pid)):
            complete_url_1 = "https://tieba.baidu.com/p/" + str(all_missing_html_pid[html])
            store_html.write("%s\n" % complete_url_1)

    with open(store_img_path, "w", encoding="utf-8") as store_img:
        for img in range(len(all_missing_img_pid)):
            complete_url_2 = "https://tieba.baidu.com/p/" + str(all_missing_img_pid[img])
            store_img.write("%s\n" % complete_url_2)

    print("\n Data integrity of %s has been checked." % usr_name)

if __name__ == "__main__":
    main()

  4. Modify image paths

The image urls in the HTML files point to Baidu's image hosting, so they need to be rewritten to local paths.

from bs4 import BeautifulSoup
from os import listdir
import re

def modify_src(folder_path, file_name):
    file_path = folder_path + '//' + file_name

    soup = BeautifulSoup(open(file_path, encoding = "utf-8"), "html.parser")

    # recover the thread id (pid) from the canonical thread link inside the page
    url = [elm.get_text() for elm in soup.find_all("a", href=re.compile(r"^https://tieba.baidu.com/p/"))]
    pid = url[0][-10:]

    # modify image src
    # unmodified src: https://imgsa.baidu.com/forum/w%3D580/sign=4d3033fbbdde9c82a665f9875c8080d2/4417d558ccbf6c815f62fb2ab23eb13532fa4035.jpg
    # modified: ./img/6233150605/09d6a94bd11373f0a6c6bb5daa0f4bfbf9ed0488.jpg
    # pattern: ./img/pid/img_name
    # img_name: img["src"][-44:]
    # unmodified emoticon src :https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon72.png
    # modified: ../emoticon/image_emoticon72.png
    for img in soup.find_all('img', {"src": True}):
        if img["src"].endswith(".jpg"):
            modified = './img/' + pid + '/' + img['src'][-44:]
            img['src'] = modified
        if img['src'].endswith('.png'):
            splited = img['src'].split('/')
            emoticon_name = splited[-1]
            emoti_modified = '../tieba_emoticon/' + emoticon_name
            img['src'] = emoti_modified

    with open(file_path, "w", encoding = "utf-8") as file:
        file.write(str(soup))

def main():
    base_path = './rose_tieba_data_processing//data//tiezi_downloaded'
    folder_names = listdir(base_path)
    for folder_item in range(len(folder_names)):
        if folder_names[folder_item] == 'tieba_emoticon':
            pass
        else:
            print('Processing files in %s' % folder_names[folder_item])
            folder_path = base_path + '//' + folder_names[folder_item]
            all_files = listdir(folder_path)

            # get all html files in a folder
            file_name = []
            for item in range(len(all_files)):
                if all_files[item].endswith('.html'):
                    file_name.append(all_files[item])

            # processing html files
            for file_id in range(len(file_name)):
                modify_src(folder_path, file_name[file_id])
                print('%s has been processed' % file_name[file_id])
            file_name.clear()

if __name__ == "__main__":
    main()

  5. Clean up noise

The titles in the HTML files contain strings such as “【图片】” and “【XX吧】_百度贴吧”, which need to be removed.

def modify_title(folder_path, file_name):
    file_path = folder_path + '//' + file_name

    soup = BeautifulSoup(open(file_path, encoding = "utf-8"), "html.parser")

    new_title = str(soup.find('title').string)
    print(new_title)
    new_title = new_title.replace('【图片】', '')
    new_title = new_title.replace('【月季花吧】_百度贴吧', '')
    new_title = new_title.replace('【天狼月季吧】_百度贴吧', '')

    soup.title.string = new_title

    new_h1 = str(soup.find('h1').string)
    new_h1 = new_h1.replace('【图片】', '')
    new_h1 = new_h1.replace('【月季花吧】_百度贴吧', '')
    new_h1 = new_h1.replace('【天狼月季吧】_百度贴吧', '')

    soup.h1.string = new_h1

    with open(file_path, "w", encoding = "utf-8") as file:
        file.write(str(soup))

In addition, the line “希望各位吧友能支持魔吧月刊。” that appears in some posts also needs to be removed:

def remove_noise(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    soup = BeautifulSoup(open(file_path, encoding = "utf-8"), "html.parser")
    # strip the small emoji images shown after user nicknames
    for img in soup.find_all("img", {'class': 'nicknameEmoji'}):
        img.decompose()
    # the recurring magazine plug, matched literally against the serialized html
    noise = '<div>\n<div>\n<div> #3: <b></b></div>\n<div>希望各位吧友能支持魔吧月刊。</div>\n</div>\n<hr/>\n</div>'
    cleaned = str(soup).replace(noise, '')
    with open(file_path, "w", encoding = "utf-8") as file:
        file.write(cleaned)
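
A minimal driver for these two clean-up functions, assuming they sit in the same script as the previous step's code (so listdir and BeautifulSoup are already imported) and that the folder layout is the same as in that step; the clean_all helper is only for illustration:

def clean_all(base_path):
    # walk every category folder and run both clean-up passes over its html files
    for folder_name in listdir(base_path):
        if folder_name == 'tieba_emoticon':
            continue
        folder_path = base_path + '//' + folder_name
        for file_name in listdir(folder_path):
            if file_name.endswith('.html'):
                modify_title(folder_path, file_name)
                remove_noise(folder_path, file_name)
                print('%s has been cleaned' % file_name)

if __name__ == "__main__":
    clean_all('./rose_tieba_data_processing//data//tiezi_downloaded')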

  6. Compile the collection

I looked for differences between my backup and S's by comparing file titles. Because the file names generated by Evernote are quite messy, I cleaned them up with regular expressions first.

import os
from os import listdir
import re

# collect spider data
spider_path = "./tieba-download//html-only"
spider_original_names = []
spider_names = []

spider_folders = listdir(spider_path)
for spider_folder_id in range(len(spider_folders)):
    spider_sub_path = spider_path + "//" + spider_folders[spider_folder_id]
    spider_files = listdir(spider_sub_path)
    spider_original_names.extend(spider_files)

# remove unnecessary suffix
for spider_item in range(len(spider_original_names)):
    spider_names.append(spider_original_names[spider_item].replace("【月季花吧】_百度贴吧", ""))

# remove duplicate names in spider_data
spider_names = list(set(spider_names))



# collect evernote data
evernote_path = "G://ddd-data-evernote"
evernote_original_names = []
evernote_names = []
for file in os.listdir(evernote_path):
    if file.endswith(".html"):
        evernote_original_names.append(file)


# compile regex expression
pattern_string = r"【月季花吧】_\w{1,4}\s\[\d{1}\]|【月季花吧】_\w{1,4}|_\w{4}_\w{1,4}\s\[\d{1}\]|_\w{4}_\w{0,4}|【月季花吧】"
pattern = re.compile(pattern_string)

# remove unnecessary suffix
for item in range(len(evernote_original_names)):
    evernote_names.append(pattern.sub("", evernote_original_names[item]))

# remove duplicate names in evernote_data
evernote_names = list(set(evernote_names))


# double check files
spider_minus_evernote = []
evernote_minus_spider = []
for evernote_id in range(len(evernote_names)):
    if evernote_names[evernote_id] in spider_names:
        pass
    else:
        evernote_minus_spider.append(evernote_names[evernote_id])

for spider_id in range(len(spider_names)):
    if spider_names[spider_id] in evernote_names:
        pass
    else:
        spider_minus_evernote.append(spider_names[spider_id])

# set basic paths       
evernote_store_path = "./evernote_minus_spider.txt"
spider_store_path = "./spider_minus_evernote.txt"


# store data which is in evernote but not in spider
with open(evernote_store_path, "w", encoding='utf-8') as evernote_save:
    for evernote_save_item in evernote_minus_spider:
        evernote_save.write("%s\n" % evernote_save_item)

# store data which is not in evernote but in spider
with open(spider_store_path, "w", encoding='utf-8') as spider_save:
    for spider_save_item in spider_minus_evernote:
        spider_save.write("%s\n" % spider_save_item)

print("Missing files in evernote and spider have been checked.")
  1. 生成目录

I sorted the posts by their publication dates and generated a table of contents. The script below reads the sorted records from a pickle file; the sorting step itself is sketched after it.

import pickle

with open("ordered_temp_data.p", "rb") as data_file:
    all_temp_data = pickle.load(data_file)

# data structure:
# [year, month, day, title, category, path]
# e.g. [2018, 10, 14, '巴黎七月的粉龙沙', '品种介绍-梅昂 (Meilland)', './品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html']

hrefs = []

# href :
# <p> 10月14日 <a href="./品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html">巴黎七月的粉龙沙</a></p>
for item in range(len(all_temp_data)):
    href = '<p> ' + str(all_temp_data[item][1]) + '月' + str(all_temp_data[item][2]) + '日  ' + '<a href=\"' + all_temp_data[item][5] + "\">" + all_temp_data[item][3] + '</a></p>'
    hrefs.append(href)


save_path = 'G://rose_tieba_data_processing//codes//href-three.txt'
with open(save_path, "w", encoding="utf-8") as store_hrefs:
    for href_id in range(len(hrefs)):
        store_hrefs.write("%s\n" % hrefs[href_id])
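
The sort that produces ordered_temp_data.p is not shown above. Here is a minimal sketch of it, assuming the [year, month, day, title, category, path] record structure from the comments; the input file name temp_data.p and its unsorted list are hypothetical.

import pickle

# hypothetical input: an unsorted list of [year, month, day, title, category, path] records
with open("temp_data.p", "rb") as unsorted_file:
    temp_data = pickle.load(unsorted_file)

# sort by publication date: year first, then month, then day
ordered_temp_data = sorted(temp_data, key=lambda record: (record[0], record[1], record[2]))

with open("ordered_temp_data.p", "wb") as sorted_file:
    pickle.dump(ordered_temp_data, sorted_file)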