Browse Source

First try

main
Noëlle 2 years ago
parent
commit
f2a8884c9b
No known key found for this signature in database
1 changed files with 29 additions and 114 deletions
  1. 29
    114
      main.py

+ 29
- 114
main.py View File

import requests, time, re, os, configparser, sys, argparse
import requests, time, re, os, configparser, sys, argparse, shutil
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Remote CSS files to mirror locally. Index-aligned with css_to_change_to:
# css_to_change[i] is downloaded and rewritten to ../css/css_to_change_to[i]
# in the archived pages.
css_to_change = ["https://www.somethingawful.com/css/main.css?12",
"https://forums.somethingawful.com/css/bbcode.css?1456974408",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
"https://www.somethingawful.com/css/globalmenu.css",
"https://www.somethingawful.com/css/forums.css?1545838155"
]
# Local filenames (under archive/css/) for the URLs above, same order.
css_to_change_to = ["main.css",
"bbcode.css",
"jquery-ui.min.css",
"globalmenu.css",
"forums.css"
]

# Remote scripts to mirror locally. Index-aligned with scripts_to_change_to:
# scripts_to_change[i] is saved as archive/scripts/scripts_to_change_to[i].
scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
"https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
"https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
"https://twemoji.maxcdn.com/2/twemoji.min.js",
]
# Local filenames (under archive/scripts/) for the URLs above, same order.
scripts_to_change_to = ["jquery.min.js",
"jquery-migrate.min.js",
"jquery-ui.min.js",
"forums-combined.js",
"twemoji.min.js"
]



def main(args):
    """Log in to Infovaya and download every session PDF for one event.

    Reads credentials from config.ini ([DEFAULT] username / password), logs in,
    walks the event's session listing, and saves each linked PDF under pdfs/
    with a filename of the form <Session_Title>-<pdf_name>.

    NOTE(review): the source viewed here is a flattened commit diff that
    interleaved the old SomethingAwful thread archiver with this Infovaya
    fetcher; this is the reconstructed post-commit version, with the bugs
    noted inline fixed. `args` is kept for interface compatibility with the
    argparse-driven entry point even though this version no longer reads it.

    Raises:
        requests.HTTPError: if login, a session page, or a PDF fetch fails.
    """
    print("Fetching PDFs.")
    # First run: create the output directory for downloaded PDFs.
    if not os.path.isdir("pdfs"):
        print("First-time setup...")
        os.mkdir("pdfs")

    # interpolation=None so raw '%' characters in passwords are not mangled.
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        print("username and password must be present in config.ini.")
        sys.exit(0)
    # Bug fix: the config file was never read, so the credential lookups
    # below would fail with a KeyError even when config.ini exists.
    config.read("config.ini")

    INFOVAYA_ROOT = "https://events.infovaya.com/"

    info = {"username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login"
            }

    # A Session keeps the login cookie for every subsequent request.
    s = requests.Session()
    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
    q.raise_for_status()

    # Event id 88 is hard-coded — TODO(review): promote to a CLI argument.
    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
    sessions_request.raise_for_status()
    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
    print("Starting to search for PDFs. This may take a while...")
    # Raw string fixes the invalid "\?" escape-sequence warning.
    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
        time.sleep(0.05)  # be polite to the server between session pages
        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
        page_request.raise_for_status()
        page_soup = BeautifulSoup(page_request.text, "html.parser")
        # Bug fix: original read `page.soup.h1.string`, which raises
        # NameError/AttributeError — the soup object is `page_soup`.
        page_title = page_soup.h1.string
        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
            pdf_name = pdf_tag["href"].split("/")[-1]
            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
            pdf_request.raise_for_status()
            print("Fetching PDF for '{}'...".format(page_title))
            # Bug fix: save into pdfs/ as the success message claims; the
            # original opened bare `file_name` in the working directory.
            with open(os.path.join("pdfs", file_name), "wb") as file:
                # Undo transport compression so the raw stream is the PDF bytes.
                pdf_request.raw.decode_content = True
                shutil.copyfileobj(pdf_request.raw, file)
            print("Saved as pdfs/{}".format(file_name))
    # Stale wording kept from the thread-archiver era — message text unchanged.
    print("Finished fetching thread.")

if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("thread", action="store", help="The threadid from the thread's URL") parser.add_argument("thread", action="store", help="The threadid from the thread's URL")

Loading…
Cancel
Save