@@ -1,11 +1,14 @@
 import requests, time, re, os, configparser, sys, argparse, shutil
 from bs4 import BeautifulSoup
 
-def main(args):
+def main(args: dict) -> None:
     print(f"Fetching PDFs.")
     if not os.path.isdir("pdfs"):
-        print("First-time setup...")
+        print("Setting up PDFs directory.")
         os.mkdir("pdfs")
+    if args.save_pages and not os.path.isdir("html"):
+        print("Setting up HTML directory.")
+        os.mkdir("html")
 
     config = configparser.ConfigParser(interpolation=None)
     if not os.path.isfile('config.ini'):
@@ -24,36 +27,64 @@ def main(args):
         "action": "login"
     }
 
-    s = requests.Session()
-    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
-    q.raise_for_status()
+    infovaya_session = requests.Session()
+    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
+    infovaya_creds_request.raise_for_status()
 
-    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
+    sessions_request = infovaya_session.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
     sessions_request.raise_for_status()
     sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
+    if args.save_pages:
+        with open("html/main.html", "w+", encoding="utf-8") as file:
+            file.write(sessions_page_soup.prettify())
     print("Starting to search for PDFs. This may take a while...")
     for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
-        time.sleep(0.05)
-        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
-        page_request.raise_for_status()
-        page_soup = BeautifulSoup(page_request.text, "html.parser")
-        page_title = page.soup.h1.string
-        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
-            pdf_name = pdf_tag["href"].split("/")[-1]
-            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
-            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
-            pdf_request.raise_for_status()
-            print("Fetching PDF for '{}'...".format(page_title))
-            with open(file_name, "wb") as file:
-                pdf_request.raw.decode_content = True
-                shutil.copyfileobj(pdf_request.raw, file)
-            print("Saved as pdfs/{}".format(file_name))
+        session_href = session_link["href"][1:]
+        page_name = f"session_{session_href.split('=')[-1]}.html"
+        if not os.path.isfile(f"html/{page_name}"):
+            time.sleep(1)
+            print(f"Fetching {session_href} ... ", end="")
+            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
+            page_request.raise_for_status()
+            print("Success! Finding presentations... ", end="")
+            page_soup = BeautifulSoup(page_request.text, "html.parser")
+            if args.save_pages:
+                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
+                    file.write(page_soup.prettify())
+                print(f"Wrote to file html/{page_name}.")
+            else:
+                print()
+            # import pdb; pdb.set_trace()
+            for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
+                presentation_href = presentation_link["href"][1:]
+                pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
+                if not os.path.isfile(f"html/{pres_page_name}"):
+                    print(f" Fetching {presentation_href} ...", end="")
+                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
+                    pres_request.raise_for_status()
+                    print("Success! ", end="")
+                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
+                    if args.save_pages:
+                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
+                            file.write(pres_page_soup.prettify())
+                    pres_page_title = pres_page_soup.h1.string
+                    print(f"Found '{pres_page_title}'. Finding PDF...")
+                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
+                        pdf_name = pdf_tag["href"].split("/")[-1]
+                        file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
+                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
+                        pdf_request.raise_for_status()
+                        print(f"Fetching PDF for '{pres_page_title}'...")
+                        with open(f"pdfs/{file_name}", "wb") as file:
+                            file.write(pdf_request.content)
+                        print(f"Saved as pdfs/{file_name}")
+                    time.sleep(2)
 
     print("Finished fetching thread.")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
parser.add_argument("thread", action="store", help="The threadid from the thread's URL") |
|
|
|
parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!") |
|
|
|
parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of ") |
|
|
|
# parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!") |
|
|
|
     args = parser.parse_args()
     print(args)
     main(args)
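
Note on the elided middle: the two hunks skip over the part of main() that defines INFOVAYA_ROOT, reads config.ini, and builds the `info` payload posted to the login endpoint. As a rough orientation sketch only — the [credentials] section name and the username/password keys are hypothetical and not taken from the patch; only `"action": "login"` is confirmed by the hunk above:

    import configparser

    # Hypothetical reconstruction of the elided config-loading code; not part of the patch.
    INFOVAYA_ROOT = "https://..."  # actual root URL lives in the elided lines; note it must end with "/"
    config = configparser.ConfigParser(interpolation=None)
    config.read("config.ini")
    info = {
        "username": config["credentials"]["username"],  # assumed section/key names
        "password": config["credentials"]["password"],  # assumed section/key names
        "action": "login",  # confirmed by the hunk above
    }

After this patch the script takes no positional arguments (the old "thread" argument is removed); pass -s/--save-pages when running it to also write the fetched HTML pages into the html/ directory.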