Noëlle 2 years ago
parent
commit
f2a8884c9b
No known key found for this signature in database
1 changed file with 29 additions and 114 deletions

main.py +29 -114

@@ -1,61 +1,12 @@
-import requests, time, re, os, configparser, sys, argparse
+import requests, time, re, os, configparser, sys, argparse, shutil
 from bs4 import BeautifulSoup
-from PIL import Image
-from io import BytesIO

css_to_change = ["https://www.somethingawful.com/css/main.css?12",
"https://forums.somethingawful.com/css/bbcode.css?1456974408",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
"https://www.somethingawful.com/css/globalmenu.css",
"https://www.somethingawful.com/css/forums.css?1545838155"
]
css_to_change_to = ["main.css",
"bbcode.css",
"jquery-ui.min.css",
"globalmenu.css",
"forums.css"
]

scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
"https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
"https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
"https://twemoji.maxcdn.com/2/twemoji.min.js",
]
scripts_to_change_to = ["jquery.min.js",
"jquery-migrate.min.js",
"jquery-ui.min.js",
"forums-combined.js",
"twemoji.min.js"
]


 def main(args):
-    print(f"Fetching from thread {args.thread}.")
-    if not os.path.isdir("archive"):
+    print(f"Fetching PDFs.")
+    if not os.path.isdir("pdfs"):
         print("First-time setup...")
-        os.mkdir("archive")
-    if not os.path.isdir("archive/css"):
-        print("Setting up CSS...")
-        os.mkdir("archive/css")
-        for f in range(len(css_to_change)):
-            r = requests.get(css_to_change[f])
-            with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
-                file.write(r.text)
-    if not os.path.isdir("archive/scripts"):
-        print("Setting up scripts...")
-        os.mkdir("archive/scripts")
-        for f in range(len(scripts_to_change)):
-            r = requests.get(scripts_to_change[f])
-            with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
-                file.write(r.text)
+        os.mkdir("pdfs")
-    if not os.path.isdir(f"archive/{args.thread}"):
-        print(f"Creating directory for {args.thread}...")
-        os.mkdir(f"archive/{args.thread}")
-    if not os.path.isdir(f"archive/{args.thread}/images"):
-        print(f"Creating directory for {args.thread}/images...")
-        os.mkdir(f"archive/{args.thread}/images")
     config = configparser.ConfigParser(interpolation=None)
     if not os.path.isfile('config.ini'):
         print("config.ini is missing!")
@@ -66,76 +17,40 @@ def main(args):
print("username and password must be present in config.ini.")
sys.exit(0)

INFOVAYA_ROOT = "https://events.infovaya.com/"

info = { "username": config["DEFAULT"]["username"],
"password": config["DEFAULT"]["password"],
"action": "login"
}

s = requests.Session()
q = s.post("https://forums.somethingawful.com/account.php", data=info)

if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
else:
lastpage = 1

i = lastpage
parse_ok = True
-    while True:
+    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
+    q.raise_for_status()
+
+    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
+    sessions_request.raise_for_status()
+    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
+    print("Starting to search for PDFs. This may take a while...")
+    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
         time.sleep(0.05)
-        payload = {'threadid': args.thread, 'pagenumber': str(i)}
-        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
-        if "Specified thread was not found in the live forums." in r.text:
-            print("That thread does not exist or is not accessible to you.")
-            parse_ok = False
-            break
-        if "The page number you requested" in r.text:
-            i -= 1
-            break
print(f"Fetching page {i} in thread {args.thread}.")
with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
soup = BeautifulSoup(r.text, "html.parser")
for tag in soup.find_all("link",{"href":True}):
if tag["href"] in css_to_change:
tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
for tag in soup.find_all("script",{"src":True}):
if tag["src"] in scripts_to_change:
tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
for tag in soup.find_all("a",{"title":True}):
if tag["title"] == "Next page":
tag["href"] = f"page{i+1}.html"
if tag["title"] == "Previous page":
tag["href"] = f"page{i-1}.html"
if args.images:
for tag in soup.find_all("img",{"src":True}):
src = tag["src"]
if src[:4] != "http":
src = "https:" + src
imgname = src.split("/")[-1]
fullpath = f"archive/{args.thread}/images/{imgname}"
if os.path.isfile(fullpath):
tag["src"] = f"images/{imgname}"
else:
img = s.get(src, stream=True)
if img.status_code == 200:
try:
theimage = Image.open(BytesIO(img.content))
print(f"\tSaving {fullpath}.")
theimage.save(fullpath)
tag["src"] = f"images/{imgname}"
except:
print(f"\tImage {src} not available.")
else:
print(f"\tImage {src} not available.")
file.write(soup.prettify())
i += 1

+        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
+        page_request.raise_for_status()
+        page_soup = BeautifulSoup(page_request.text, "html.parser")
+        page_title = page_soup.h1.string
+        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
+            pdf_name = pdf_tag["href"].split("/")[-1]
+            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
+            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
+            pdf_request.raise_for_status()
+            print("Fetching PDF for '{}'...".format(page_title))
+            with open("pdfs/{}".format(file_name), "wb") as file:
+                pdf_request.raw.decode_content = True
+                shutil.copyfileobj(pdf_request.raw, file)
+            print("Saved as pdfs/{}".format(file_name))
print("Finished fetching thread.")

config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
with open("config.ini", "w") as file:
config.write(file)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")

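With the thread positional argument removed in this commit, the script would presumably now be invoked bare, something like:

python main.py

assuming the rest of the argument parsing below the shown hunk was not extended with new options.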