import requests, time, re, os, configparser, sys, argparse, shutil
from bs4 import BeautifulSoup

# Base URL for every Infovaya request (login, session pages, PDFs).
INFOVAYA_ROOT = "https://events.infovaya.com/"


def _load_credentials() -> dict:
    """Read username/password from config.ini.

    Returns the POST payload for the Infovaya login form.
    Exits with a nonzero status if the file or either field is missing.
    """
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        # BUG FIX: was sys.exit(0) — a fatal error must exit nonzero.
        sys.exit(1)
    config.read('config.ini')
    default = config["DEFAULT"]
    if not default.get("username") or not default.get("password"):
        print("username and password must be present in config.ini.")
        sys.exit(1)
    return {
        "username": default["username"],
        "password": default["password"],
        "action": "login",
    }


def main(args: argparse.Namespace) -> None:
    """Log in to Infovaya and download every presentation PDF for an event.

    Walks the event's session list, then each session's presentations, and
    saves each linked PDF under pdfs/.  Optionally saves the fetched HTML
    pages (--save-pages) and keeps a per-event cache file of already-seen
    session/presentation IDs so interrupted runs can resume.
    """
    print(f"Fetching PDFs.")
    if not os.path.isdir("pdfs"):
        print("Setting up PDFs directory.")
        os.mkdir("pdfs")
    if args.save_pages and not os.path.isdir("html"):
        print("Setting up HTML directory.")
        os.mkdir("html")

    info = _load_credentials()

    infovaya_session = requests.Session()
    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
    infovaya_creds_request.raise_for_status()

    # Default event ID is 88 when --event is not given.
    root_event = args.event if args.event else 88
    caching = not args.no_cache
    cache_list = []
    cache_file = None
    cache_path = f"{root_event}_cache.txt"
    if caching:
        # BUG FIX: the original opened the cache in "w"/"a" (write-only) and
        # then called readlines(), which raises io.UnsupportedOperation —
        # and "w" mode would also have truncated an existing cache.  Read
        # the existing entries first, then open separately for appending.
        if os.path.isfile(cache_path):
            with open(cache_path, "r") as existing:
                # BUG FIX: strip trailing newlines so "1234" can actually
                # match a stored line (readlines() keeps the "\n").
                cache_list = [line.strip() for line in existing]
        cache_file = open(cache_path, "a")

    skip_until = bool(args.start_with)
    start_with = args.start_with

    try:
        sessions_request = infovaya_session.get(
            f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions"
        )
        sessions_request.raise_for_status()
        sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
        if args.save_pages:
            with open("html/main.html", "w+", encoding="utf-8") as file:
                file.write(sessions_page_soup.prettify())
        print("Starting to search for PDFs. \nThis may take a while...")

        # Raw strings: "session\?id" was an invalid escape sequence.
        for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
            session_href = session_link["href"][1:]
            page_id = session_href.split("=")[-1]
            page_name = f"session_{page_id}.html"

            # --start-with: skip sessions until we reach the requested ID.
            if skip_until and page_id != start_with:
                continue
            skip_until = False

            if caching:
                if page_id in cache_list:
                    continue
                # BUG FIX: the original wrote the session ID to the cache
                # *before* the --start-with skip check, so skipped sessions
                # were wrongly marked as already seen.
                cache_file.write(f"{page_id}\n")

            time.sleep(1)  # be polite to the server
            print(f"Fetching {session_href} ... ", end="")
            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
            page_request.raise_for_status()
            print("Success! Finding presentations... ", end="")
            page_soup = BeautifulSoup(page_request.text, "html.parser")
            if args.save_pages:
                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
                    file.write(page_soup.prettify())
                print(f"Wrote to file html/{page_name}.")
            else:
                print()

            for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
                presentation_href = presentation_link["href"][1:]
                presentation_id = presentation_href.split('=')[-1]
                pres_page_name = f"pres_{presentation_id}.html"
                if caching and presentation_id in cache_list:
                    continue

                print(f" Fetching {presentation_href} ...", end="")
                pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
                pres_request.raise_for_status()
                print("Success! ", end="")
                pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
                if args.save_pages:
                    with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
                        file.write(pres_page_soup.prettify())
                pres_page_title = pres_page_soup.h1.string
                print(f"Found '{pres_page_title}'. \nFinding PDF...")

                for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
                    pdf_name = pdf_tag["href"].split("/")[-1]
                    # NOTE(review): titles containing "/" would break this
                    # path — consider sanitizing; left as-is to preserve
                    # existing file names.
                    file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
                    pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
                    pdf_request.raise_for_status()
                    if b"Download Quota Exceeded" in pdf_request.content:
                        print("""ATTENTION Infovaya is reporting that you've exceeded your download limit. If you're caching, you can start over from this point tomorrow. If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
                        # finally-block below closes the cache file.
                        sys.exit(0)
                    if caching:
                        cache_file.write(f"{presentation_id}\n")
                    print(f"Fetching PDF for '{pres_page_title}'...")
                    if not os.path.isfile(f"pdfs/{file_name}"):
                        with open(f"pdfs/{file_name}", "wb") as file:
                            file.write(pdf_request.content)
                        print(f"Saved as pdfs/{file_name}")
                    time.sleep(2)
    finally:
        # BUG FIX: the original leaked the cache file handle on every
        # sys.exit / exception path.
        if cache_file is not None:
            cache_file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true",
                        help="Save HTML of the scanned pages")
    # BUG FIX: --event takes a value (the event ID); action="store_true"
    # made it a bare flag, so "-e 100" was rejected and args.event could
    # only ever be True, yielding the URL "event?id=True".
    parser.add_argument("-e", "--event", action="store",
                        help="Choose which event ID to scan")
    parser.add_argument("-c", "--no-cache", action="store_true",
                        help="Don't cache previously-seen pages; re-download everything")
    parser.add_argument("-w", "--start-with", action="store",
                        help="Start with the given session ID")
    args = parser.parse_args()
    print(args)
    main(args)