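    # The credentials request above logs in to Infovaya; the code below (assumed to
    # continue the body of main(args), with imports and the login handled earlier in
    # the file) reuses the authenticated infovaya_session for every request.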
    infovaya_creds_request.raise_for_status()

    root_event = 88 if not args.event else args.event
    caching = not args.no_cache
    cache_list = []
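
    # Optional on-disk cache: one page ID per line in <event>_cache.txt, so a
    # later run can skip sessions and presentations that were already downloaded.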
    if caching:
        if not os.path.isfile(f"{root_event}_cache.txt"):
            cache_file = open(f"{root_event}_cache.txt", "w")
        else:
            # "a+" keeps existing entries readable while still appending new ones;
            # strip the trailing newlines so membership checks against IDs work.
            cache_file = open(f"{root_event}_cache.txt", "a+")
            cache_file.seek(0)
            cache_list = [line.strip() for line in cache_file.readlines()]
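
    # With --start-with, sessions are skipped until the given session ID is reached.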
    skip_until = False if not args.start_with else True
    start_with = args.start_with

    sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
    sessions_request.raise_for_status()
    # Parse the sessions listing; sessions_page_soup drives the loop below.
    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
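
    # Crawl: the event's sessions listing links to session pages, each of which
    # links to presentations whose PDFs we try to download.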
    print("Starting to search for PDFs. This may take a while...")
    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
        session_href = session_link["href"][1:]
        page_id = session_href.split("=")[-1]
        page_name = f"session_{page_id}.html"
        if caching and page_id not in cache_list:
            cache_file.write(f"{page_id}\n")
        if skip_until and page_id != start_with:
            continue
        skip_until = False
        if not caching or page_id not in cache_list:
            time.sleep(1)
            print(f"Fetching {session_href} ... ", end="")
            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
            page_request.raise_for_status()
            # Parse the session page; page_soup is needed by the presentation loop below.
            page_soup = BeautifulSoup(page_request.text, "html.parser")
            # import pdb; pdb.set_trace()
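            # Each session page links to its presentations; visit each one to find its PDF.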
            for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
                presentation_href = presentation_link["href"][1:]
                presentation_id = presentation_href.split('=')[-1]
                pres_page_name = f"pres_{presentation_id}.html"
                if not caching or presentation_id not in cache_list:
                    print(f" Fetching {presentation_href} ...", end="")
                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
                    pres_request.raise_for_status()
                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
                    # Assumption: the presentation title is the page's <h1> and the PDF link
                    # is the first anchor whose href contains ".pdf"; adjust these selectors
                    # if Infovaya's markup differs.
                    pres_page_title = pres_page_soup.find("h1").get_text(strip=True)
                    pdf_tag = pres_page_soup.find("a", href=re.compile(r"\.pdf"))
                    pdf_name = pdf_tag["href"].split("/")[-1]
                    file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
                    pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
                    pdf_request.raise_for_status()
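                    # Infovaya can return the quota-exceeded notice as a normal (non-error)
                    # response, so it has to be detected in the body rather than by
                    # raise_for_status().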
                    if b"Download Quota Exceeded" in pdf_request.content:
                        print("""ATTENTION
Infovaya is reporting that you've exceeded your download limit.
If you're caching, you can start over from this point tomorrow.
If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
                        sys.exit(0)
                    if caching and presentation_id not in cache_list:
                        cache_file.write(f"{presentation_id}\n")
                    print(f"Fetching PDF for '{pres_page_title}'...")
                    if not os.path.isfile(f"pdfs/{file_name}"):
                        with open(f"pdfs/{file_name}", "wb") as file:
                            file.write(pdf_request.content)
                        print(f"Saved as pdfs/{file_name}")
                        time.sleep(2)

    if caching:
        cache_file.close()


if __name__ == "__main__":
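    # Command-line interface; the parsed arguments are handed to main().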
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
    parser.add_argument("-e", "--event", action="store", help="Choose which event ID to scan (default: 88)")
    parser.add_argument("-c", "--no-cache", action="store_true", help="Don't cache previously-seen pages; re-download everything")
    parser.add_argument("-w", "--start-with", action="store", help="Start with the given session ID")
    args = parser.parse_args()
    print(args)
    main(args)