Browse code

First try

main
Noëlle 2 years ago
Parent
Current commit
f2a8884c9b
No key found matching this signature
1 file changed, 29 insertions and 114 deletions
  1. 29
    114
      main.py

+ 29
- 114
main.py  View file

# Standard library
import argparse
import configparser
import os
import re
import shutil
import sys
import time
from io import BytesIO

# Third party
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Remote stylesheet URLs that appear in fetched forum pages, paired
# index-for-index with the local filenames they are rewritten to.
css_to_change = [
    "https://www.somethingawful.com/css/main.css?12",
    "https://forums.somethingawful.com/css/bbcode.css?1456974408",
    "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
    "https://www.somethingawful.com/css/globalmenu.css",
    "https://www.somethingawful.com/css/forums.css?1545838155",
]
# Local mirror filenames (saved under archive/css/).
css_to_change_to = [
    "main.css",
    "bbcode.css",
    "jquery-ui.min.css",
    "globalmenu.css",
    "forums.css",
]

# Remote script URLs referenced by fetched forum pages, paired
# index-for-index with the local filenames they are rewritten to.
scripts_to_change = [
    "https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
    "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
    "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
    "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
    "https://twemoji.maxcdn.com/2/twemoji.min.js",
]
# Local mirror filenames (saved under archive/scripts/).
scripts_to_change_to = [
    "jquery.min.js",
    "jquery-migrate.min.js",
    "jquery-ui.min.js",
    "forums-combined.js",
    "twemoji.min.js",
]



def main(args):
    """Archive a Something Awful forums thread as local, browsable HTML.

    Logs in with the credentials from ``config.ini``, downloads every page
    of the thread identified by ``args.thread`` into
    ``archive/<thread>/pageN.html``, rewrites stylesheet/script/pagination
    links to local copies, and (when ``args.images`` is set) mirrors inline
    images. Resumes from the last archived page recorded in ``config.ini``.

    Exits via sys.exit(0) if config.ini is missing.
    """
    print(f"Fetching from thread {args.thread}.")

    # --- one-time setup: mirror the forum's CSS and JS locally ---
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        os.mkdir("archive/css")
        for n in range(len(css_to_change)):
            r = requests.get(css_to_change[n])
            with open(f"archive/css/{css_to_change_to[n]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        os.mkdir("archive/scripts")
        for n in range(len(scripts_to_change)):
            r = requests.get(scripts_to_change[n])
            with open(f"archive/scripts/{scripts_to_change_to[n]}", "w+") as file:
                file.write(r.text)

    # --- per-thread directories ---
    if not os.path.isdir(f"archive/{args.thread}"):
        print(f"Creating directory for {args.thread}...")
        os.mkdir(f"archive/{args.thread}")
    if not os.path.isdir(f"archive/{args.thread}/images"):
        print(f"Creating directory for {args.thread}/images...")
        os.mkdir(f"archive/{args.thread}/images")

    # interpolation=None so '%' in passwords is not treated specially.
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        print("username and password must be present in config.ini.")
        sys.exit(0)
    # BUG FIX: the parser was constructed but never loaded, so the
    # credential lookups below would raise KeyError.
    config.read('config.ini')

    info = {
        "username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login",
    }

    # Log in once; the session keeps the auth cookies for later requests.
    s = requests.Session()
    s.post("https://forums.somethingawful.com/account.php", data=info)

    # Resume from the last page archived on a previous run, if recorded.
    if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
        lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
    else:
        lastpage = 1

    i = lastpage
    parse_ok = True
    while True:
        time.sleep(0.05)  # throttle politely between page fetches
        payload = {'threadid': args.thread, 'pagenumber': str(i)}
        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
        if "Specified thread was not found in the live forums." in r.text:
            print("That thread does not exist or is not accessible to you.")
            parse_ok = False
            break
        if "The page number you requested" in r.text:
            # Walked past the end of the thread; step back to the last real page.
            i -= 1
            break
        print(f"Fetching page {i} in thread {args.thread}.")
        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
            soup = BeautifulSoup(r.text, "html.parser")
            # Point stylesheet and script references at the local mirrors.
            for tag in soup.find_all("link", {"href": True}):
                if tag["href"] in css_to_change:
                    tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
            for tag in soup.find_all("script", {"src": True}):
                if tag["src"] in scripts_to_change:
                    tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
            # Rewrite next/previous pagination links to the local page files.
            for tag in soup.find_all("a", {"title": True}):
                if tag["title"] == "Next page":
                    tag["href"] = f"page{i+1}.html"
                if tag["title"] == "Previous page":
                    tag["href"] = f"page{i-1}.html"
            if args.images:
                for tag in soup.find_all("img", {"src": True}):
                    src = tag["src"]
                    if src[:4] != "http":
                        src = "https:" + src  # protocol-relative URL
                    imgname = src.split("/")[-1]
                    fullpath = f"archive/{args.thread}/images/{imgname}"
                    if os.path.isfile(fullpath):
                        # Already mirrored on a previous run/page.
                        tag["src"] = f"images/{imgname}"
                    else:
                        img = s.get(src, stream=True)
                        if img.status_code == 200:
                            try:
                                theimage = Image.open(BytesIO(img.content))
                                print(f"\tSaving {fullpath}.")
                                theimage.save(fullpath)
                                tag["src"] = f"images/{imgname}"
                            except Exception:
                                # Narrowed from a bare except: PIL raises on
                                # truncated/non-image payloads; skip them.
                                print(f"\tImage {src} not available.")
                        else:
                            print(f"\tImage {src} not available.")
            file.write(soup.prettify())
        i += 1
    print("Finished fetching thread.")

    # Persist progress so the next run resumes at this page.
    # NOTE(review): written even when parse_ok is False; in that case i still
    # equals the starting page, so the stored value is unchanged — confirm
    # this matches the intended resume semantics.
    config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
    with open("config.ini", "w") as file:
        config.write(file)

if __name__ == "__main__":
    # CLI entry point: the thread id is the `threadid` query parameter from
    # the thread's URL on the forums.
    parser = argparse.ArgumentParser()
    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
    # main() also reads args.images; the original tail of this block was cut
    # off in the paste, so the flag and the dispatch are restored here.
    parser.add_argument("--images", action="store_true",
                        help="Also download inline images and rewrite them to local copies")
    main(parser.parse_args())

Loading...
Cancel
Save