123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
import argparse
import configparser
import os
import re
import sys
import time
from io import BytesIO
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from PIL import Image
-
# Each remote asset is declared once as (remote URL, local file name) so the
# two halves of a pair cannot drift out of step.  The four public lists below
# are derived from these tables and keep their original names, order and
# contents for the code that indexes them by position.
_CSS_ASSETS = [
    ("https://www.somethingawful.com/css/main.css?12", "main.css"),
    ("https://forums.somethingawful.com/css/bbcode.css?1456974408", "bbcode.css"),
    (
        "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
        "jquery-ui.min.css",
    ),
    ("https://www.somethingawful.com/css/globalmenu.css", "globalmenu.css"),
    ("https://www.somethingawful.com/css/forums.css?1545838155", "forums.css"),
]

_SCRIPT_ASSETS = [
    ("https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js", "jquery.min.js"),
    (
        "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
        "jquery-migrate.min.js",
    ),
    ("https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js", "jquery-ui.min.js"),
    ("https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227", "forums-combined.js"),
    ("https://twemoji.maxcdn.com/2/twemoji.min.js", "twemoji.min.js"),
]

# Stylesheet URLs as they appear in fetched pages, and the local file name
# each one is stored under (same index in both lists).
css_to_change = [url for url, _ in _CSS_ASSETS]
css_to_change_to = [name for _, name in _CSS_ASSETS]

# Script URLs as they appear in fetched pages, and the local file name each
# one is stored under (same index in both lists).
scripts_to_change = [url for url, _ in _SCRIPT_ASSETS]
scripts_to_change_to = [name for _, name in _SCRIPT_ASSETS]
-
-
# Base URL that relative and protocol-relative links in fetched pages are
# resolved against.
_FORUM_BASE = "https://forums.somethingawful.com/"

# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block the archiver indefinitely.
_REQUEST_TIMEOUT = 30


def _load_config(path="config.ini"):
    """Read *path* and return the parser; exit with status 1 if unusable.

    The file must exist and its DEFAULT section must hold a non-empty
    ``username`` and ``password``.
    """
    if not os.path.isfile(path):
        print("config.ini is missing!")
        sys.exit(1)
    config = configparser.ConfigParser(interpolation=None)
    config.read(path)
    defaults = config["DEFAULT"]
    if not defaults.get("username") or not defaults.get("password"):
        print("username and password must be present in config.ini.")
        sys.exit(1)
    return config


def _download_assets(urls, filenames, directory):
    """Save each URL in *urls* into *directory* under the paired file name."""
    for url, filename in zip(urls, filenames):
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f"\tCould not download {url}: {err}")
            continue
        if not response.ok:
            print(f"\tCould not download {url} (HTTP {response.status_code}).")
            continue
        # Write the raw bytes so the result does not depend on the platform's
        # default text encoding.
        with open(f"{directory}/{filename}", "wb") as file:
            file.write(response.content)


def _prepare_directories(thread):
    """Create the archive tree for *thread*, fetching shared assets once."""
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        os.mkdir("archive/css")
        _download_assets(css_to_change, css_to_change_to, "archive/css")
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        os.mkdir("archive/scripts")
        _download_assets(scripts_to_change, scripts_to_change_to, "archive/scripts")
    if not os.path.isdir(f"archive/{thread}"):
        print(f"Creating directory for {thread}...")
        os.mkdir(f"archive/{thread}")
    if not os.path.isdir(f"archive/{thread}/images"):
        print(f"Creating directory for {thread}/images...")
        os.mkdir(f"archive/{thread}/images")


def _localize_links(soup, page, css_map, script_map):
    """Point stylesheet, script and pagination links in *soup* at local files."""
    for tag in soup.find_all("link", {"href": True}):
        local = css_map.get(tag["href"])
        if local is not None:
            tag["href"] = "../css/" + local
    for tag in soup.find_all("script", {"src": True}):
        local = script_map.get(tag["src"])
        if local is not None:
            tag["src"] = "../scripts/" + local
    for tag in soup.find_all("a", {"title": True}):
        if tag["title"] == "Next page":
            tag["href"] = f"page{page + 1}.html"
        elif tag["title"] == "Previous page":
            tag["href"] = f"page{page - 1}.html"


def _localize_image(session, tag, thread):
    """Point *tag* at a local copy of its image, downloading it if needed.

    On any failure the tag is left untouched and a notice is printed.
    """
    # urljoin resolves protocol-relative ("//host/x.png") and site-relative
    # ("/x.png") sources and leaves absolute URLs unchanged.
    src = urljoin(_FORUM_BASE, tag["src"])
    # Take the name from the URL path only: a trailing "?query" would end up
    # in the file extension and stop Pillow from inferring the image format.
    imgname = os.path.basename(urlsplit(src).path)
    if not imgname:
        print(f"\tImage {src} not available.")
        return
    fullpath = f"archive/{thread}/images/{imgname}"
    if not os.path.isfile(fullpath):
        try:
            img = session.get(src, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f"\tImage {src} not available.")
            return
        if img.status_code != 200:
            print(f"\tImage {src} not available.")
            return
        try:
            theimage = Image.open(BytesIO(img.content))
            print(f"\tSaving {fullpath}.")
            theimage.save(fullpath)
        except Exception:
            # Best effort: an undecodable or unsavable image must not abort
            # the run.  Unlike a bare ``except``, this lets Ctrl-C through.
            print(f"\tImage {src} not available.")
            return
    tag["src"] = f"images/{imgname}"


def main(args):
    """Archive the forum thread ``args.thread`` as local HTML pages.

    Pages are fetched starting from the page number stored in config.ini
    under ``lastpage<thread>`` (page 1 if absent) and written to
    ``archive/<thread>/page<N>.html`` with stylesheet, script and pagination
    links rewritten to local paths.  When ``args.images`` is true, images
    are saved under ``archive/<thread>/images`` and their tags rewritten.
    After the last page the resume position is written back to config.ini.

    Args:
        args: Namespace with ``thread`` (thread id string) and ``images``
            (bool) attributes.

    Raises:
        SystemExit: With status 1 when config.ini is missing or lacks
            credentials, or when the forum reports the thread as not found.
            The configuration is checked before anything is created or
            downloaded.
    """
    print(f"Fetching from thread {args.thread}.")
    config = _load_config()
    _prepare_directories(args.thread)

    css_map = dict(zip(css_to_change, css_to_change_to))
    script_map = dict(zip(scripts_to_change, scripts_to_change_to))

    session = requests.Session()
    # NOTE: the login response is not inspected, so a rejected login is not
    # reported at this point.
    session.post(
        _FORUM_BASE + "account.php",
        data={
            "username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login",
        },
        timeout=_REQUEST_TIMEOUT,
    )

    lastpage_key = f"lastpage{args.thread}"
    saved_page = config["DEFAULT"].get(lastpage_key, "")
    page = int(saved_page) if saved_page else 1

    while True:
        time.sleep(0.05)
        response = session.get(
            _FORUM_BASE + "showthread.php",
            params={"threadid": args.thread, "pagenumber": str(page)},
            timeout=_REQUEST_TIMEOUT,
        )
        if "Specified thread was not found in the live forums." in response.text:
            print("That thread does not exist or is not accessible to you.")
            # Leave config.ini untouched and report failure to the caller.
            sys.exit(1)
        if "The page number you requested" in response.text:
            # Step back to the last existing page so the next run re-fetches
            # it; never store a page number below 1.
            page = max(page - 1, 1)
            break
        print(f"Fetching page {page} in thread {args.thread}.")
        soup = BeautifulSoup(response.text, "html.parser")
        _localize_links(soup, page, css_map, script_map)
        if args.images:
            for tag in soup.find_all("img", {"src": True}):
                _localize_image(session, tag, args.thread)
        # Open the file only once the page is fully processed, so a failure
        # above does not leave an empty page file behind.
        with open(f"archive/{args.thread}/page{page}.html", "w", encoding="utf-8") as file:
            file.write(soup.prettify())
        page += 1

    print("Finished fetching thread.")

    config["DEFAULT"][lastpage_key] = str(page)
    with open("config.ini", "w") as file:
        config.write(file)
-
def build_parser():
    """Return the command-line parser: a thread id plus an optional -i flag."""
    parser = argparse.ArgumentParser()
    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
    parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
    return parser


if __name__ == "__main__":
    # Parse sys.argv and hand the resulting namespace to the archiver.
    args = build_parser().parse_args()
    main(args)
|