Backing Up Baidu Tieba Posts
Background
In August 2020 I started growing roses to relieve stress. After getting into the hobby I followed a blogger who wrote about rose varieties. His posts were well illustrated, rich in detail, and beautifully written. Most of the varieties he recommended were strongly disease-resistant, in sharp contrast to the delicate varieties that nursery sellers promote heavily, so the discussion around him was rarely calm, with veiled and open mockery from time to time. In October 2020 a nursery seller who was then a moderator of one of the forums set a trap for him, intending to delete those variety-introduction posts. To preserve the material, I backed up the great majority of his posts and converted them into local files. Along the way I found that there are very few articles online explaining in detail how to back up Tieba posts, so I decided to write one myself for whoever needs it later.
Steps
Here is a brief overview of what I did. I compiled the list of post links to back up by hand, then used hjhee's tiebaSpider to generate HTML files and zhaohui8969's tiebaImageGet to batch-download the images in those posts. Next I checked that the URLs, HTML files, and images matched one-to-one, rewrote the image and Tieba-emoticon URLs in the HTML to local paths, and cleaned the noise out of the data. Another user, S, had saved part of the blogger's posts with Evernote; I cross-checked his files against mine and compiled a collection of the backed-up posts.
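All of the scripts below assume a plain urls.txt with one thread link per line, where the last ten characters of each link are the numeric thread id. As a quick illustration (not part of my original workflow; check_url_file is just a throwaway name), such a file can be sanity-checked with a few lines of Python:

def check_url_file(txt_path='urls.txt'):
    # every line should look like https://tieba.baidu.com/p/6100954692;
    # the scripts below assume the last 10 characters are the thread id (pid)
    with open(txt_path, 'r', encoding='utf-8') as url_file:
        for line_no, line in enumerate(url_file, start=1):
            url = line.strip()
            if not url:
                continue
            if not (url.startswith('https://tieba.baidu.com/p/') and url[-10:].isdigit()):
                print('Line %d does not look like a thread link: %s' % (line_no, url))

check_url_file('urls.txt')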
- Generating HTML files
Use hjhee's tiebaSpider. Because of network restrictions, its dependencies may need to be downloaded manually from their official sites and extracted into the specified directories.
- Downloading post images
Use zhaohui8969's tiebaImageGet. The original code only downloads the images from a single link per run, so I modified it to download the images from multiple links in one run.
import time

# ImageGet is the downloader class defined in zhaohui8969's tiebaImageGet script;
# this main() replaces the original one in that file.
def main():
    # usr_name = "relu"
    # txt_name = "urls.txt"
    txt_path = './backup//urls//202101//urls.txt'
    # one thread URL per line, e.g. https://tieba.baidu.com/p/6100954692
    with open(txt_path, "r") as url_file:
        lines = [x.strip() for x in url_file.readlines()]
    # the last 10 characters of each URL form the thread id (pid)
    pids = [int(url[-10:]) for url in lines if url]
    print(u"\nData has been processed")
    max_thread_num = 20
    save_directory = './backup//202101//img'
    try:
        image_get_obj = ImageGet(max_thread_num, save_directory)
        for pid in pids:
            print(u'\nStarting download')
            image_get_obj(pid)
            print(u'\nSleeping for 5 seconds')
            time.sleep(5)
        print(u'\nAll images from the links in this file have been downloaded. '
              u'Switch to the next url file and change your IP address.')
    except Exception as error:
        print(u'\nSomething went wrong (%s); debug it in the try block of main().\n' % error)
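As the final message suggests, I reran this main() once per month folder, editing txt_path and save_directory by hand each time. The per-month runs could also be wrapped in a loop; the sketch below is only an illustration, assuming the month folders under ./backup/urls are named like 202101 and that ImageGet behaves as in zhaohui8969's original script.

import time
from os import listdir

def download_all_months(base_path='./backup', max_thread_num=20):
    # assumes ./backup/urls/<month>/urls.txt holds the thread URLs for each month
    # and that images should be saved to ./backup/<month>/img;
    # ImageGet comes from zhaohui8969's tiebaImageGet script, as above
    for month in listdir(base_path + '//urls'):
        txt_path = base_path + '//urls//' + month + '//urls.txt'
        with open(txt_path, 'r') as url_file:
            pids = [int(line.strip()[-10:]) for line in url_file if line.strip()]
        image_get_obj = ImageGet(max_thread_num, base_path + '//' + month + '//img')
        for pid in pids:
            image_get_obj(pid)
            time.sleep(5)  # stay gentle with the server between threads
        print('Finished %s; consider changing your IP before the next batch' % month)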
- Checking file integrity
The code used in the two steps above does not write any error log, so I need to check that the URLs, HTML files, and images correspond one-to-one. The code is below.
from os import listdir


def get_htmlPid(html_folders_path, html_file_name):
    # html_file_name = title + ".html" (".html" is 5 characters long)
    title_len = len(html_file_name) - 5
    # 447: number of fixed characters in the html file before the pid in the thread urls;
    # the title length itself is accounted for separately via 2 * title_len
    begin = 447 + (2 * title_len)
    end = begin + 10
    html_file_path = html_folders_path + "//" + html_file_name
    with open(html_file_path, 'r', encoding='utf-8') as HtmlFile:
        html_source_code = HtmlFile.read()
    html_pid = int(html_source_code[begin:end])
    return html_pid


def get_imgPid(img_folders_path):
    # every image folder is named after its thread id (pid)
    img_pid = listdir(img_folders_path)
    return [int(pid) for pid in img_pid]


def get_urlPid(url_path):
    url_pid = []
    with open(url_path, "r") as load_url_file:
        plain_urls = [x.strip() for x in load_url_file.readlines()]
    for single_url in plain_urls:
        url_pid.append(int(single_url[-10:]))
    return url_pid


def check_integrity(url_pid, html_pid, img_pid):
    # remove duplicates
    final_url_pid = list(set(url_pid))
    final_html_pid = set(html_pid)
    final_img_pid = set(img_pid)
    missing_html = []
    missing_img = []
    # every url pid should have a matching html file and image folder
    for pid in final_url_pid:
        if pid not in final_html_pid:
            missing_html.append(pid)
        if pid not in final_img_pid:
            missing_img.append(pid)
    return missing_html, missing_img


def main():
    usr_name = "relu"
    base_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//" + usr_name
    store_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//z-missing-files"
    folders = listdir(base_path)
    # accumulate missing_html and missing_img across all folders
    all_missing_html_pid = []
    all_missing_img_pid = []
    for folder_name in folders:
        # initialize paths
        html_path = base_path + "//" + folder_name
        img_path = base_path + "//" + folder_name + "//img"
        url_path = base_path + "//" + folder_name + "//urls.txt"
        # collect the pids of this folder's html files
        html_pid = []
        html_file_names = [name for name in listdir(html_path) if name.endswith(".html")]
        for html_name in html_file_names:
            html_pid.append(get_htmlPid(html_path, html_name))
        img_pid = get_imgPid(img_path)
        url_pid = get_urlPid(url_path)
        missing_html_pid, missing_img_pid = check_integrity(url_pid, html_pid, img_pid)
        all_missing_html_pid.extend(missing_html_pid)
        all_missing_img_pid.extend(missing_img_pid)
    store_html_path = store_path + "//" + usr_name + "-missing-html.txt"
    store_img_path = store_path + "//" + usr_name + "-missing-img.txt"
    # write the missing threads back out as full URLs
    with open(store_html_path, "w", encoding="utf-8") as store_html:
        for pid in all_missing_html_pid:
            store_html.write("%s\n" % ("https://tieba.baidu.com/p/" + str(pid)))
    with open(store_img_path, "w", encoding="utf-8") as store_img:
        for pid in all_missing_img_pid:
            store_img.write("%s\n" % ("https://tieba.baidu.com/p/" + str(pid)))
    print("\nData integrity of %s has been checked." % usr_name)


if __name__ == "__main__":
    main()
- Rewriting image paths
The image URLs in the HTML files point to Baidu's image hosting and need to be rewritten to local paths.
from bs4 import BeautifulSoup
from os import listdir
import re


def modify_src(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # get the thread id (pid) from the links that point back to the thread
    urls = [elm.get_text() for elm in soup.find_all("a", href=re.compile(r"^https://tieba.baidu.com/p/"))]
    pid = urls[0][-10:]
    # rewrite image src attributes
    # original src: https://imgsa.baidu.com/forum/w%3D580/sign=4d3033fbbdde9c82a665f9875c8080d2/4417d558ccbf6c815f62fb2ab23eb13532fa4035.jpg
    # rewritten:    ./img/6233150605/09d6a94bd11373f0a6c6bb5daa0f4bfbf9ed0488.jpg
    # pattern:      ./img/pid/img_name, where img_name = img["src"][-44:]
    # original emoticon src: https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon72.png
    # rewritten:             ../tieba_emoticon/image_emoticon72.png
    for img in soup.find_all('img', {"src": True}):
        if img["src"].endswith(".jpg"):
            img['src'] = './img/' + pid + '/' + img['src'][-44:]
        elif img['src'].endswith('.png'):
            emoticon_name = img['src'].split('/')[-1]
            img['src'] = '../tieba_emoticon/' + emoticon_name
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))


def main():
    base_path = './rose_tieba_data_processing//data//tiezi_downloaded'
    # file_name = "鹅黄美人 Buff Beauty.html"
    # file_path = base_path + "//" + file_name
    folder_names = listdir(base_path)
    for folder_name in folder_names:
        # the shared emoticon folder contains no html files to rewrite
        if folder_name == 'tieba_emoticon':
            continue
        print('Processing files in %s' % folder_name)
        folder_path = base_path + '//' + folder_name
        # get all html files in the folder
        html_names = [name for name in listdir(folder_path) if name.endswith('.html')]
        for html_name in html_names:
            modify_src(folder_path, html_name)
            print('%s has been processed' % html_name)


if __name__ == "__main__":
    main()
- Cleaning up noise
The titles of the HTML files contain fragments such as “【图片】” and “XX吧_百度贴吧”, which need to be removed.
from bs4 import BeautifulSoup


def modify_title(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # strip the noise fragments from the <title> tag
    new_title = str(soup.find('title').string)
    print(new_title)
    new_title = new_title.replace('【图片】', '')
    new_title = new_title.replace('【月季花吧】_百度贴吧', '')
    new_title = new_title.replace('【天狼月季吧】_百度贴吧', '')
    soup.title.string = new_title
    # strip the same fragments from the <h1> heading
    new_h1 = str(soup.find('h1').string)
    new_h1 = new_h1.replace('【图片】', '')
    new_h1 = new_h1.replace('【月季花吧】_百度贴吧', '')
    new_h1 = new_h1.replace('【天狼月季吧】_百度贴吧', '')
    soup.h1.string = new_h1
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))
In addition, the recurring line “希望各位吧友能支持魔吧月刊。” inside the posts also needs to be removed:
def remove_noise(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # remove the emoji images embedded in user nicknames
    for img in soup.find_all("img", {'class': 'nicknameEmoji'}):
        img.decompose()
    # remove the recurring "魔吧月刊" block
    noise = '<div>\n<div>\n<div> #3: <b></b></div>\n<div>希望各位吧友能支持魔吧月刊。</div>\n</div>\n<hr/>\n</div>'
    cleaned = str(soup).replace(noise, '')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(cleaned)
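Neither of the two functions above comes with a driver, so here is a minimal sketch in the style of the earlier main(), with modify_title and remove_noise defined as above. It assumes the same tiezi_downloaded/tieba_emoticon layout as the path-rewriting step, and clean_all is a name I made up for this example.

from os import listdir

def clean_all(base_path='./rose_tieba_data_processing//data//tiezi_downloaded'):
    # walk every post folder, skipping the shared emoticon folder,
    # and run both cleaning passes on each html file
    for folder_name in listdir(base_path):
        if folder_name == 'tieba_emoticon':
            continue
        folder_path = base_path + '//' + folder_name
        for file_name in listdir(folder_path):
            if file_name.endswith('.html'):
                modify_title(folder_path, file_name)
                remove_noise(folder_path, file_name)
                print('%s has been cleaned' % file_name)

if __name__ == "__main__":
    clean_all()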
- Compiling the collection
To find the differences between my backup and S's, I compared file titles. Because the file names generated by Evernote are quite messy, I cleaned them up with regular expressions first.
import os
import re

# collect spider data
spider_path = "./tieba-download//html-only"
spider_original_names = []
spider_names = []
spider_folders = os.listdir(spider_path)
for spider_folder in spider_folders:
    spider_sub_path = spider_path + "//" + spider_folder
    spider_original_names.extend(os.listdir(spider_sub_path))
# remove the unnecessary suffix
for original_name in spider_original_names:
    spider_names.append(original_name.replace("【月季花吧】_百度贴吧", ""))
# remove duplicate names in the spider data
spider_names = list(set(spider_names))

# collect evernote data
evernote_path = "G://ddd-data-evernote"
evernote_original_names = []
evernote_names = []
for file in os.listdir(evernote_path):
    if file.endswith(".html"):
        evernote_original_names.append(file)
# compile the regex that matches Evernote's messy suffixes
pattern_string = r"【月季花吧】_\w{1,4}\s\[\d{1}\]|【月季花吧】_\w{1,4}|_\w{4}_\w{1,4}\s\[\d{1}\]|_\w{4}_\w{0,4}|【月季花吧】"
pattern = re.compile(pattern_string)
# remove the unnecessary suffix
for original_name in evernote_original_names:
    evernote_names.append(pattern.sub("", original_name))
# remove duplicate names in the evernote data
evernote_names = list(set(evernote_names))

# cross-check the two collections
spider_minus_evernote = []
evernote_minus_spider = []
for evernote_name in evernote_names:
    if evernote_name not in spider_names:
        evernote_minus_spider.append(evernote_name)
for spider_name in spider_names:
    if spider_name not in evernote_names:
        spider_minus_evernote.append(spider_name)

# set output paths
evernote_store_path = "./evernote_minus_spider.txt"
spider_store_path = "./spider_minus_evernote.txt"
# store titles that are in evernote but not in the spider backup
with open(evernote_store_path, "w", encoding='utf-8') as evernote_save:
    for evernote_save_item in evernote_minus_spider:
        evernote_save.write("%s\n" % evernote_save_item)
# store titles that are in the spider backup but not in evernote
with open(spider_store_path, "w", encoding='utf-8') as spider_save:
    for spider_save_item in spider_minus_evernote:
        spider_save.write("%s\n" % spider_save_item)
print("Missing files in evernote and spider have been checked.")
- Generating an index
Finally, I sorted the posts by publication date and generated an index.
import pickle

# all_temp_data structure: [year, month, day, title, category, path]
# e.g. [2018, 10, 14, '巴黎七月的粉龙沙', '品种介绍-梅昂 (Meilland)', './品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html']
with open("ordered_temp_data.p", "rb") as data_file:
    all_temp_data = pickle.load(data_file)

# each index entry looks like:
# <p> 10月14日 <a href="./品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html">巴黎七月的粉龙沙</a></p>
hrefs = []
for item in all_temp_data:
    href = ('<p> ' + str(item[1]) + '月' + str(item[2]) + '日 '
            + '<a href="' + item[5] + '">' + item[3] + '</a></p>')
    hrefs.append(href)

save_path = 'G://rose_tieba_data_processing//codes//href-three.txt'
with open(save_path, "w", encoding="utf-8") as store_hrefs:
    for href in hrefs:
        store_hrefs.write("%s\n" % href)
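The ordered_temp_data.p file is produced in a separate step that is not shown here. A minimal sketch of how such a date-ordered pickle could be built is below; raw_entries and its single example entry are hypothetical, and only the [year, month, day, title, category, path] layout comes from the comment above.

import pickle

# hypothetical entries in the [year, month, day, title, category, path] layout
raw_entries = [
    [2018, 10, 14, '巴黎七月的粉龙沙', '品种介绍-梅昂 (Meilland)',
     './品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html'],
    # ... one entry per backed-up post ...
]

# sort by publication date: year, then month, then day
ordered_temp_data = sorted(raw_entries, key=lambda entry: (entry[0], entry[1], entry[2]))

with open("ordered_temp_data.p", "wb") as data_file:
    pickle.dump(ordered_temp_data, data_file)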