Selaa lähdekoodia

Add caching and resuming

main
Noëlle 1 vuosi sitten
vanhempi
commit
abf4cc8477
No known key found for this signature in database
2 muutettua tiedostoa jossa 48 lisäystä ja 9 poistoa
  1. 9
    1
      README.md
  2. 39
    8
      main.py

+ 9
- 1
README.md Näytä tiedosto



You can use the `-e` flag (e.g. `python3 main.py -e 88`) to determine which event ID to scan for presentations that have PDFs. By default, this is event 88. (The number is unfortunate; it's the event this was written for, [the 39th IEEE International Conference on Robotics and Automation](https://events.infovaya.com/event?id=88), and bears no other symbolism here.)


You can use the `-s` flag (e.g. `python3 main.py -s`) to save the HTML content of each page along with the PDF. This is mostly for diagnostic purposes. The CSS and Javascript files required by the HTML files are included here, but you may have to move them somewhere else to get them to work properly (where depends on your system).

You can use the `-c` flag (e.g. `python3 main.py -c`) to disable caching. This script automatically caches which pages it's seen, so it doesn't try to download them again. The `-c` flag will start the whole process over every time you launch the script this way. Note that this will *not* destroy an existing cache.

You can use the `-w` flag (e.g. `python3 main.py -w 76105`) to start from a given **session ID** (*not* a presentation ID). This is useful if you've used the `-c` flag and had to interrupt the process, or if you used the script before I added caching. Note that if you have caching enabled (this is the default) and you specify a session ID past where you last left off, the script will add the intermediate sessions to the cache.

Note that you can use any combination of these flags! `python3 main.py -e 74 -s -w 24601` is just fine.

Please contact the author by email at `noelle AT noelle.codes` or by Mastodon at `chat.noelle.codes/@noelle` if you have questions or trouble.

+ 39
- 8
main.py Näytä tiedosto

infovaya_creds_request.raise_for_status() infovaya_creds_request.raise_for_status()


root_event = 88 if not args.event else args.event root_event = 88 if not args.event else args.event
caching = not args.no_cache
cache_list = []
if caching:
if not os.path.isfile(f"{root_event}_cache.txt"):
cache_file = open(f"{root_event}_cache.txt", "w")
else:
cache_file = open(f"{root_event}_cache.txt", "a")
cache_list = cache_file.readlines()

skip_until = False if not args.start_with else True
start_with = args.start_with


sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions") sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
sessions_request.raise_for_status() sessions_request.raise_for_status()
print("Starting to search for PDFs. This may take a while...") print("Starting to search for PDFs. This may take a while...")
for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")): for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
session_href = session_link["href"][1:] session_href = session_link["href"][1:]
page_name = f"session_{session_href.split('=')[-1]}.html"
if not os.path.isfile(f"html/{page_name}"):
page_id = session_href.split("=")[-1]
page_name = f"session_{page_id}.html"
if caching and page_id not in cache_list:
cache_file.write(f"{page_id}\n")
if skip_until and page_id != start_with:
continue
skip_until = False
if not caching or page_id not in cache_list:
time.sleep(1) time.sleep(1)
print(f"Fetching {session_href} ... ", end="") print(f"Fetching {session_href} ... ", end="")
page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}") page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
# import pdb; pdb.set_trace() # import pdb; pdb.set_trace()
for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")): for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
presentation_href = presentation_link["href"][1:] presentation_href = presentation_link["href"][1:]
pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
if not os.path.isfile(f"html/{pres_page_name}"):
presentation_id = presentation_href.split('=')[-1]
pres_page_name = f"pres_{presentation_id}.html"
if not caching or presentation_id not in cache_list:
print(f" Fetching {presentation_href} ...", end="") print(f" Fetching {presentation_href} ...", end="")
pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}") pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
pres_request.raise_for_status() pres_request.raise_for_status()
file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}" file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}") pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
pdf_request.raise_for_status() pdf_request.raise_for_status()
if b"Download Quota Exceeded" in pdf_request.content:
print("""ATTENTION
Infovaya is reporting that you've exceeded your download limit.
If you're caching, you can start over from this point tomorrow.
If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
sys.exit(0)
if caching and presentation_id not in cache_list:
cache_file.write(f"{presentation_id}\n")
print(f"Fetching PDF for '{pres_page_title}'...") print(f"Fetching PDF for '{pres_page_title}'...")
with open(f"pdfs/{file_name}", "wb") as file:
file.write(pdf_request.content)
print(f"Saved as pdfs/{file_name}")
time.sleep(2)
if not os.path.isfile(f"pdfs/{file_name}"):
with open(f"pdfs/{file_name}", "wb") as file:
file.write(pdf_request.content)
print(f"Saved as pdfs/{file_name}")
time.sleep(2)
if caching:
cache_file.close()


if __name__ == "__main__":
    # Command-line entry point: parse flags and hand them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true",
                        help="Save HTML of the scanned pages")
    # BUG FIX: "-e" was declared with action="store_true", so it could never
    # accept the event ID argument (e.g. "-e 88") that the README documents.
    # With action="store" and type=int, args.event is the chosen event ID,
    # or None when omitted (main() then falls back to the default event 88
    # via `root_event = 88 if not args.event else args.event`).
    parser.add_argument("-e", "--event", action="store", type=int,
                        help="Choose which event ID to scan")
    parser.add_argument("-c", "--no-cache", action="store_true",
                        help="Don't cache previously-seen pages; re-download everything")
    parser.add_argument("-w", "--start-with", action="store",
                        help="Start with the given session ID")
    args = parser.parse_args()
    print(args)  # echo parsed options so the user can confirm what will run
    main(args)

Loading…
Peruuta
Tallenna