import argparse
import configparser
import os
import re
import shutil
import sys
import time
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PIL import Image
# Remote stylesheet URLs paired with the local filename each one is
# mirrored to under archive/css/.
_CSS_ASSETS = [
    ("https://www.somethingawful.com/css/main.css?12", "main.css"),
    ("https://forums.somethingawful.com/css/bbcode.css?1456974408", "bbcode.css"),
    ("https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css", "jquery-ui.min.css"),
    ("https://www.somethingawful.com/css/globalmenu.css", "globalmenu.css"),
    ("https://www.somethingawful.com/css/forums.css?1545838155", "forums.css"),
]

# Remote script URLs paired with the local filename each one is
# mirrored to under archive/scripts/.
_SCRIPT_ASSETS = [
    ("https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js", "jquery.min.js"),
    ("https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js", "jquery-migrate.min.js"),
    ("https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js", "jquery-ui.min.js"),
    ("https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227", "forums-combined.js"),
    ("https://twemoji.maxcdn.com/2/twemoji.min.js", "twemoji.min.js"),
]

# Parallel lists kept for compatibility with the lookup code in main(),
# which indexes css_to_change_to by css_to_change.index(...).
css_to_change = [url for url, _ in _CSS_ASSETS]
css_to_change_to = [name for _, name in _CSS_ASSETS]
scripts_to_change = [url for url, _ in _SCRIPT_ASSETS]
scripts_to_change_to = [name for _, name in _SCRIPT_ASSETS]
def _mirror_assets(urls, local_names, dest_dir):
    """Download each remote text asset (CSS/JS) and save it under dest_dir
    so archived pages can reference local copies instead of the live CDN."""
    os.mkdir(dest_dir)
    for url, local_name in zip(urls, local_names):
        r = requests.get(url)
        # Explicit UTF-8 so the mirrored assets round-trip on any platform.
        with open(f"{dest_dir}/{local_name}", "w+", encoding="utf-8") as file:
            file.write(r.text)


def _first_time_setup(thread):
    """Create the archive directory tree and mirror CSS/JS on first run."""
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        _mirror_assets(css_to_change, css_to_change_to, "archive/css")
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        _mirror_assets(scripts_to_change, scripts_to_change_to, "archive/scripts")
    if not os.path.isdir(f"archive/{thread}"):
        print(f"Creating directory for {thread}...")
        os.mkdir(f"archive/{thread}")
    if not os.path.isdir(f"archive/{thread}/images"):
        print(f"Creating directory for {thread}/images...")
        os.mkdir(f"archive/{thread}/images")


def _load_config():
    """Read config.ini and return the parser; exit if the file is missing.

    BUG FIX: the original constructed a ConfigParser but never called
    .read('config.ini'), so every later config["DEFAULT"][...] lookup
    would raise KeyError even when the file existed.
    """
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        print("username and password must be present in config.ini.")
        sys.exit(0)
    config.read('config.ini')
    return config


def _rewrite_links(soup, page_number):
    """Point CSS/JS references at the local mirror and make the
    next/previous pagination links relative to the archived files."""
    for tag in soup.find_all("link", {"href": True}):
        if tag["href"] in css_to_change:
            tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
    for tag in soup.find_all("script", {"src": True}):
        if tag["src"] in scripts_to_change:
            tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
    for tag in soup.find_all("a", {"title": True}):
        if tag["title"] == "Next page":
            tag["href"] = f"page{page_number + 1}.html"
        if tag["title"] == "Previous page":
            tag["href"] = f"page{page_number - 1}.html"


def _archive_images(soup, session, thread):
    """Best-effort download of every <img> on the page, rewriting src to
    the local copy.  Failures are logged and skipped, never fatal."""
    for tag in soup.find_all("img", {"src": True}):
        src = tag["src"]
        # Protocol-relative URLs ("//host/...") need an explicit scheme.
        if src[:4] != "http":
            src = "https:" + src
        imgname = src.split("/")[-1]
        fullpath = f"archive/{thread}/images/{imgname}"
        if os.path.isfile(fullpath):
            # Already archived on a previous page/run; just rewrite the tag.
            tag["src"] = f"images/{imgname}"
            continue
        img = session.get(src, stream=True)
        if img.status_code != 200:
            print(f"\tImage {src} not available.")
            continue
        try:
            theimage = Image.open(BytesIO(img.content))
            print(f"\tSaving {fullpath}.")
            theimage.save(fullpath)
            tag["src"] = f"images/{imgname}"
        except Exception:
            # Narrowed from a bare except:; policy stays best-effort
            # (corrupt/unsupported images are skipped, not fatal).
            print(f"\tImage {src} not available.")


def main(args):
    """Archive a Something Awful forums thread page-by-page.

    Pages are saved as archive/<thread>/pageN.html with CSS/JS/pagination
    links rewritten to local mirrors and (with args.images) images
    downloaded alongside.  Progress is resumed from, and saved back to,
    the lastpage<thread> key in config.ini.

    NOTE(review): this function previously contained interleaved fragments
    of an unrelated Infovaya PDF scraper (events.infovaya.com login,
    "session?id" scraping, PDF downloads) left behind by a bad merge; the
    flow was incoherent (two logins, a NameError on `page.soup`).  The
    foreign fragments have been removed to restore the thread archiver,
    which is what the surrounding argparse/config code supports.
    """
    print(f"Fetching from thread {args.thread}.")
    _first_time_setup(args.thread)

    config = _load_config()
    info = {"username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login"}
    s = requests.Session()
    # Log in once; the session cookie authorizes all later page fetches.
    s.post("https://forums.somethingawful.com/account.php", data=info)

    # Resume from the last archived page for this thread, if recorded.
    key = f"lastpage{args.thread}"
    if key in config["DEFAULT"] and config["DEFAULT"][key] != "":
        lastpage = int(config["DEFAULT"][key])
    else:
        lastpage = 1

    i = lastpage
    while True:
        time.sleep(0.05)  # be polite to the forum server
        payload = {'threadid': args.thread, 'pagenumber': str(i)}
        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
        if "Specified thread was not found in the live forums." in r.text:
            print("That thread does not exist or is not accessible to you.")
            break
        if "The page number you requested" in r.text:
            # Walked past the final page: step back so the last real page
            # is re-fetched (and any new posts picked up) on the next run.
            i -= 1
            break
        print(f"Fetching page {i} in thread {args.thread}.")
        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
            soup = BeautifulSoup(r.text, "html.parser")
            _rewrite_links(soup, i)
            if args.images:
                _archive_images(soup, s, args.thread)
            file.write(soup.prettify())
        i += 1

    print("Finished fetching thread.")
    config["DEFAULT"][key] = str(i)
    with open("config.ini", "w") as file:
        config.write(file)
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| parser = argparse.ArgumentParser() | parser = argparse.ArgumentParser() | ||||
| parser.add_argument("thread", action="store", help="The threadid from the thread's URL") | parser.add_argument("thread", action="store", help="The threadid from the thread's URL") |