import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup

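# main() logs into Infovaya, walks every session and presentation page for the
# chosen event, and downloads each linked PDF into the pdfs/ directory.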
def main(args: argparse.Namespace) -> None:
  print("Fetching PDFs.")
  if not os.path.isdir("pdfs"):
    print("Setting up PDFs directory.")
    os.mkdir("pdfs")
  if args.save_pages and not os.path.isdir("html"):
    print("Setting up HTML directory.")
    os.mkdir("html")

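  # Credentials come from config.ini; the [DEFAULT] section must provide username and password.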
  config = configparser.ConfigParser(interpolation=None)
  if not os.path.isfile('config.ini'):
    print("config.ini is missing!")
    sys.exit(1)
  config.read('config.ini')

  if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
    print("username and password must be present in config.ini.")
    sys.exit(1)

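  # One authenticated requests.Session is reused for every request so the login cookie is kept.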
  INFOVAYA_ROOT = "https://events.infovaya.com/"

  info = { "username": config["DEFAULT"]["username"],
           "password": config["DEFAULT"]["password"],
           "action": "login"
         }

  infovaya_session = requests.Session()
  infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
  infovaya_creds_request.raise_for_status()

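  # IDs of pages handled on earlier runs are kept in <event>_cache.txt so an interrupted run can resume.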
  root_event = args.event if args.event else 88
  caching = not args.no_cache
  cache_list = []
  if caching:
    # "a+" lets us read the IDs recorded by previous runs and append new ones to the same file.
    cache_file = open(f"{root_event}_cache.txt", "a+")
    cache_file.seek(0)
    cache_list = [line.strip() for line in cache_file]

  skip_until = bool(args.start_with)
  start_with = args.start_with

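  # Pull the event's sessions listing and follow every "session?id=..." link on it.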
  sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
  sessions_request.raise_for_status()
  sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
  if args.save_pages:
    with open("html/main.html", "w+", encoding="utf-8") as file:
      file.write(sessions_page_soup.prettify())
  print("Starting to search for PDFs. This may take a while...")
  for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
    session_href = session_link["href"][1:]
    page_id = session_href.split("=")[-1]
    page_name = f"session_{page_id}.html"
    if caching and page_id not in cache_list:
      cache_file.write(f"{page_id}\n")
    if skip_until and page_id != start_with:
      continue
    skip_until = False
    if not caching or page_id not in cache_list:
      time.sleep(1)
      print(f"Fetching {session_href} ... ", end="")
      page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
      page_request.raise_for_status()
      print("Success! Finding presentations... ", end="")
      page_soup = BeautifulSoup(page_request.text, "html.parser")
      if args.save_pages:
        with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
          file.write(page_soup.prettify())
        print(f"Wrote to file html/{page_name}.")
      else:
        print()
        # import pdb; pdb.set_trace()
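      # Each session page links to its presentations; follow every "presentation?id=..." link.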
      for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
        presentation_href = presentation_link["href"][1:]
        presentation_id = presentation_href.split('=')[-1]
        pres_page_name = f"pres_{presentation_id}.html"
        if not caching or presentation_id not in cache_list:
          print(f"  Fetching {presentation_href} ...", end="")
          pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
          pres_request.raise_for_status()
          print("Success! ", end="")
          pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
          if args.save_pages:
            with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
              file.write(pres_page_soup.prettify())
          pres_page_title = pres_page_soup.h1.string
          print(f"Found '{pres_page_title}'. Finding PDF...")
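          # "pdfviewer" links point at the actual files; download each one unless it is already on disk.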
          for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
            pdf_name = pdf_tag["href"].split("/")[-1]
            file_name = f"{pres_page_title.replace(' ', '_').replace('/', '-')}-{pdf_name}"
            pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
            pdf_request.raise_for_status()
            if b"Download Quota Exceeded" in pdf_request.content:
              print("""ATTENTION
              Infovaya is reporting that you've exceeded your download limit.
              If you're caching, you can start over from this point tomorrow.
              If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
              sys.exit(0)
            if caching and presentation_id not in cache_list:
              cache_file.write(f"{presentation_id}\n")
            print(f"Saving PDF for '{pres_page_title}'...")
            if not os.path.isfile(f"pdfs/{file_name}"):
              with open(f"pdfs/{file_name}", "wb") as file:
                file.write(pdf_request.content)
                print(f"Saved as pdfs/{file_name}")
              time.sleep(2)
  if caching:
    cache_file.close()


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
  parser.add_argument("-e", "--event", action="store", help="Choose which event ID to scan")
  parser.add_argument("-c", "--no-cache", action="store_true", help="Don't cache previously-seen pages; re-download everything")
  parser.add_argument("-w", "--start-with", action="store", help="Start with the given session ID")
  args = parser.parse_args()
  print(args)
  main(args)
 
 