@@ -1,61 +1,12 @@
-import requests, time, re, os, configparser, sys, argparse
+import requests, time, re, os, configparser, sys, argparse, shutil
 from bs4 import BeautifulSoup
-from PIL import Image
-from io import BytesIO
-
-css_to_change = ["https://www.somethingawful.com/css/main.css?12",
-                 "https://forums.somethingawful.com/css/bbcode.css?1456974408",
-                 "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
-                 "https://www.somethingawful.com/css/globalmenu.css",
-                 "https://www.somethingawful.com/css/forums.css?1545838155"
-                 ]
-css_to_change_to = ["main.css",
-                    "bbcode.css",
-                    "jquery-ui.min.css",
-                    "globalmenu.css",
-                    "forums.css"
-                    ]
-
-scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
-                     "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
-                     "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
-                     "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
-                     "https://twemoji.maxcdn.com/2/twemoji.min.js",
-                     ]
-scripts_to_change_to = ["jquery.min.js",
-                        "jquery-migrate.min.js",
-                        "jquery-ui.min.js",
-                        "forums-combined.js",
-                        "twemoji.min.js"
-                        ]

 def main(args):
-    print(f"Fetching from thread {args.thread}.")
-    if not os.path.isdir("archive"):
+    print(f"Fetching PDFs.")
+    if not os.path.isdir("pdfs"):
         print("First-time setup...")
-        os.mkdir("archive")
-        if not os.path.isdir("archive/css"):
-            print("Setting up CSS...")
-            os.mkdir("archive/css")
-            for f in range(len(css_to_change)):
-                r = requests.get(css_to_change[f])
-                with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
-                    file.write(r.text)
-        if not os.path.isdir("archive/scripts"):
-            print("Setting up scripts...")
-            os.mkdir("archive/scripts")
-            for f in range(len(scripts_to_change)):
-                r = requests.get(scripts_to_change[f])
-                with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
-                    file.write(r.text)
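+        # All downloads now land in a single pdfs/ directory created on first run.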
+        os.mkdir("pdfs")

-    if not os.path.isdir(f"archive/{args.thread}"):
-        print(f"Creating directory for {args.thread}...")
-        os.mkdir(f"archive/{args.thread}")
-    if not os.path.isdir(f"archive/{args.thread}/images"):
-        print(f"Creating directory for {args.thread}/images...")
-        os.mkdir(f"archive/{args.thread}/images")
     config = configparser.ConfigParser(interpolation=None)
     if not os.path.isfile('config.ini'):
         print("config.ini is missing!")

@@ -66,76 +17,40 @@ def main(args):
         print("username and password must be present in config.ini.")
         sys.exit(0)

+    INFOVAYA_ROOT = "https://events.infovaya.com/"
+
     info = { "username": config["DEFAULT"]["username"],
              "password": config["DEFAULT"]["password"],
              "action": "login"
            }

     s = requests.Session()
-    q = s.post("https://forums.somethingawful.com/account.php", data=info)
-
-    if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
-        lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
-    else:
-        lastpage = 1
-
-    i = lastpage
-    parse_ok = True
-    while True:
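+    # Log in once; the requests.Session keeps the returned cookies, so every
+    # request below is made as the authenticated user.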
+    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
+    q.raise_for_status()
+
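+    # NOTE: the event id (88) is hard-coded, presumably the one conference
+    # this script was written to scrape.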
+    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
+    sessions_request.raise_for_status()
+    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
+    print("Starting to search for PDFs. This may take a while...")
+    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
         time.sleep(0.05)
-        payload = {'threadid': args.thread, 'pagenumber': str(i)}
-        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
-        if "Specified thread was not found in the live forums." in r.text:
-            print("That thread does not exist or is not accessible to you.")
-            parse_ok = False
-            break
-        if "The page number you requested" in r.text:
-            i -= 1
-            break
-        print(f"Fetching page {i} in thread {args.thread}.")
-        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
-            soup = BeautifulSoup(r.text, "html.parser")
-            for tag in soup.find_all("link",{"href":True}):
-                if tag["href"] in css_to_change:
-                    tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
-            for tag in soup.find_all("script",{"src":True}):
-                if tag["src"] in scripts_to_change:
-                    tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
-            for tag in soup.find_all("a",{"title":True}):
-                if tag["title"] == "Next page":
-                    tag["href"] = f"page{i+1}.html"
-                if tag["title"] == "Previous page":
-                    tag["href"] = f"page{i-1}.html"
-            if args.images:
-                for tag in soup.find_all("img",{"src":True}):
-                    src = tag["src"]
-                    if src[:4] != "http":
-                        src = "https:" + src
-                    imgname = src.split("/")[-1]
-                    fullpath = f"archive/{args.thread}/images/{imgname}"
-                    if os.path.isfile(fullpath):
-                        tag["src"] = f"images/{imgname}"
-                    else:
-                        img = s.get(src, stream=True)
-                        if img.status_code == 200:
-                            try:
-                                theimage = Image.open(BytesIO(img.content))
-                                print(f"\tSaving {fullpath}.")
-                                theimage.save(fullpath)
-                                tag["src"] = f"images/{imgname}"
-                            except:
-                                print(f"\tImage {src} not available.")
-                        else:
-                            print(f"\tImage {src} not available.")
-            file.write(soup.prettify())
-        i += 1
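+        # Visit each session page; its <h1> heading supplies the title used
+        # to name the downloaded PDFs.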
+        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
+        page_request.raise_for_status()
+        page_soup = BeautifulSoup(page_request.text, "html.parser")
+        page_title = page_soup.h1.string
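+        # Each link through the pdfviewer endpoint is treated as one downloadable PDF.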
+        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
+            pdf_name = pdf_tag["href"].split("/")[-1]
+            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
+            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
+            pdf_request.raise_for_status()
+            print("Fetching PDF for '{}'...".format(page_title))
+            with open("pdfs/{}".format(file_name), "wb") as file:
+                pdf_request.raw.decode_content = True
+                shutil.copyfileobj(pdf_request.raw, file)
+            print("Saved as pdfs/{}".format(file_name))

-    print("Finished fetching thread.")
-
-    config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
-    with open("config.ini", "w") as file:
-        config.write(file)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("thread", action="store", help="The threadid from the thread's URL")