A script for grabbing PDFs from ICRA 2022, as hosted on the Infovaya events site.
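The script logs in to Infovaya using credentials read from a config.ini file placed next to main.py. A minimal sketch of that file (the username and password values below are placeholders for your own account details):

    [DEFAULT]
    username = you@example.com
    password = your-infovaya-password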

main.py

import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup


def main(args: argparse.Namespace) -> None:
    print("Fetching PDFs.")
    # Create the output directories on first run.
    if not os.path.isdir("pdfs"):
        print("Setting up PDFs directory.")
        os.mkdir("pdfs")
    if args.save_pages and not os.path.isdir("html"):
        print("Setting up HTML directory.")
        os.mkdir("html")
    # Infovaya credentials are read from config.ini.
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile("config.ini"):
        print("config.ini is missing!")
        sys.exit(0)
    config.read("config.ini")
    if not config["DEFAULT"].get("username") or not config["DEFAULT"].get("password"):
        print("username and password must be present in config.ini.")
        sys.exit(0)
    INFOVAYA_ROOT = "https://events.infovaya.com/"
    info = {
        "username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login",
    }
    # Log in once and reuse the authenticated session for every later request.
    infovaya_session = requests.Session()
    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
    infovaya_creds_request.raise_for_status()
    # Event 88 is ICRA 2022 on Infovaya; --event overrides it.
    root_event = args.event if args.event else 88
    caching = not args.no_cache
    cache_list = []
    if caching:
        # The cache file stores IDs of sessions and presentations that were already
        # processed, so an interrupted run can pick up where it left off.
        cache_file = open(f"{root_event}_cache.txt", "a+")
        cache_file.seek(0)
        cache_list = [line.strip() for line in cache_file.readlines()]
    # With --start-with, skip everything until the given session ID is seen.
    skip_until = bool(args.start_with)
    start_with = args.start_with
    sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
    sessions_request.raise_for_status()
    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
    if args.save_pages:
        with open("html/main.html", "w+", encoding="utf-8") as file:
            file.write(sessions_page_soup.prettify())
    print("Starting to search for PDFs. This may take a while...")
    # Walk every session linked from the event's sessions page.
    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
        session_href = session_link["href"][1:]
        page_id = session_href.split("=")[-1]
        page_name = f"session_{page_id}.html"
        if caching and page_id not in cache_list:
            cache_file.write(f"{page_id}\n")
        if skip_until and page_id != start_with:
            continue
        skip_until = False
        if not caching or page_id not in cache_list:
            time.sleep(1)
            print(f"Fetching {session_href} ... ", end="")
            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
            page_request.raise_for_status()
            print("Success! Finding presentations... ", end="")
            page_soup = BeautifulSoup(page_request.text, "html.parser")
            if args.save_pages:
                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
                    file.write(page_soup.prettify())
                print(f"Wrote to file html/{page_name}.")
            else:
                print()
            # Each session page links to its presentations.
            for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
                presentation_href = presentation_link["href"][1:]
                presentation_id = presentation_href.split("=")[-1]
                pres_page_name = f"pres_{presentation_id}.html"
                if not caching or presentation_id not in cache_list:
                    print(f" Fetching {presentation_href} ...", end="")
                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
                    pres_request.raise_for_status()
                    print("Success! ", end="")
                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
                    if args.save_pages:
                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
                            file.write(pres_page_soup.prettify())
                    pres_page_title = pres_page_soup.h1.string
                    print(f"Found '{pres_page_title}'. Finding PDF...")
                    # A presentation page exposes its PDF through the pdfviewer endpoint.
                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
                        pdf_name = pdf_tag["href"].split("/")[-1]
                        file_name = f"{pres_page_title.replace(' ', '_').replace('/', '-')}-{pdf_name}"
                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
                        pdf_request.raise_for_status()
                        # Infovaya returns an error page instead of the PDF once the daily quota is hit.
                        if b"Download Quota Exceeded" in pdf_request.content:
                            print("""ATTENTION
Infovaya is reporting that you've exceeded your download limit.
If you're caching, you can start over from this point tomorrow.
If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
                            sys.exit(0)
                        if caching and presentation_id not in cache_list:
                            cache_file.write(f"{presentation_id}\n")
                        print(f"Fetching PDF for '{pres_page_title}'...")
                        if not os.path.isfile(f"pdfs/{file_name}"):
                            with open(f"pdfs/{file_name}", "wb") as file:
                                file.write(pdf_request.content)
                            print(f"Saved as pdfs/{file_name}")
                        time.sleep(2)
    if caching:
        cache_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
    parser.add_argument("-e", "--event", action="store", help="Choose which event ID to scan (defaults to 88, ICRA 2022)")
    parser.add_argument("-c", "--no-cache", action="store_true", help="Don't cache previously-seen pages; re-download everything")
    parser.add_argument("-w", "--start-with", action="store", help="Start with the given session ID")
    args = parser.parse_args()
    print(args)
    main(args)
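For reference, some typical invocations given the flags defined above (event 88 is the script's built-in default for ICRA 2022, and SESSION_ID below is a placeholder for a real session ID):

    python main.py
    python main.py --save-pages
    python main.py --event 88 --start-with SESSION_ID

Downloaded papers land in the pdfs/ directory; with --save-pages the fetched HTML is also kept under html/ for inspection.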