A script for grabbing paper PDFs from ICRA 2022 (Infovaya event 88) via the events.infovaya.com portal.

import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup


def main(args: argparse.Namespace) -> None:
    print("Fetching PDFs.")
    if not os.path.isdir("pdfs"):
        print("Setting up PDFs directory.")
        os.mkdir("pdfs")
    if args.save_pages and not os.path.isdir("html"):
        print("Setting up HTML directory.")
        os.mkdir("html")

    # Credentials live in config.ini; bail out early if they're missing.
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile("config.ini"):
        print("config.ini is missing!")
        sys.exit(0)
    config.read("config.ini")
    if ("username" not in config["DEFAULT"] or "password" not in config["DEFAULT"]
            or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == ""):
        print("username and password must be present in config.ini.")
        sys.exit(0)

    INFOVAYA_ROOT = "https://events.infovaya.com/"
    info = {
        "username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login",
    }
    infovaya_session = requests.Session()
    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
    infovaya_creds_request.raise_for_status()

    root_event = 88 if not args.event else args.event
    caching = not args.no_cache
    cache_list = []
    cache_file = None
    if caching:
        # Read previously-seen IDs first, then reopen the cache for appending.
        cache_path = f"{root_event}_cache.txt"
        if os.path.isfile(cache_path):
            with open(cache_path) as existing_cache:
                cache_list = [line.strip() for line in existing_cache]
        cache_file = open(cache_path, "a")
    skip_until = bool(args.start_with)
    start_with = args.start_with

    sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
    sessions_request.raise_for_status()
    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
    if args.save_pages:
        with open("html/main.html", "w+", encoding="utf-8") as file:
            file.write(sessions_page_soup.prettify())

    print("Starting to search for PDFs. This may take a while...")
    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
        session_href = session_link["href"][1:]
        page_id = session_href.split("=")[-1]
        page_name = f"session_{page_id}.html"
        if caching and page_id not in cache_list:
            cache_file.write(f"{page_id}\n")
        if skip_until and page_id != start_with:
            continue
        skip_until = False
        if not caching or page_id not in cache_list:
            time.sleep(1)
            print(f"Fetching {session_href} ... ", end="")
            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
            page_request.raise_for_status()
            print("Success! Finding presentations... ", end="")
            page_soup = BeautifulSoup(page_request.text, "html.parser")
            if args.save_pages:
                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
                    file.write(page_soup.prettify())
                print(f"Wrote to file html/{page_name}.")
            else:
                print()
            # Each session page links to its presentations; each of those may carry a PDF.
            for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
                presentation_href = presentation_link["href"][1:]
                presentation_id = presentation_href.split("=")[-1]
                pres_page_name = f"pres_{presentation_id}.html"
                if not caching or presentation_id not in cache_list:
                    print(f" Fetching {presentation_href} ...", end="")
                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
                    pres_request.raise_for_status()
                    print("Success! ", end="")
                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
                    if args.save_pages:
                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
                            file.write(pres_page_soup.prettify())
                    pres_page_title = pres_page_soup.h1.string
                    print(f"Found '{pres_page_title}'. Finding PDF...")
                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
                        pdf_name = pdf_tag["href"].split("/")[-1]
                        file_name = f"{pres_page_title.replace(' ', '_').replace('/', '-')}-{pdf_name}"
                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
                        pdf_request.raise_for_status()
                        if b"Download Quota Exceeded" in pdf_request.content:
                            print("""ATTENTION
Infovaya is reporting that you've exceeded your download limit.
If you're caching, you can start over from this point tomorrow.
If you haven't been caching, you can use the --start-with flag to enter the last SESSION ID above and start from there.""")
                            sys.exit(0)
                        if caching and presentation_id not in cache_list:
                            cache_file.write(f"{presentation_id}\n")
                        print(f"Fetching PDF for '{pres_page_title}'...")
                        if not os.path.isfile(f"pdfs/{file_name}"):
                            with open(f"pdfs/{file_name}", "wb") as file:
                                file.write(pdf_request.content)
                            print(f"Saved as pdfs/{file_name}")
                        time.sleep(2)
    if caching:
        cache_file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
    parser.add_argument("-e", "--event", action="store", help="Choose which event ID to scan (default: 88)")
    parser.add_argument("-c", "--no-cache", action="store_true", help="Don't cache previously-seen pages; re-download everything")
    parser.add_argument("-w", "--start-with", action="store", help="Start with the given session ID")
    args = parser.parse_args()
    print(args)
    main(args)
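
The script reads its Infovaya credentials from a config.ini file in the same directory; the [DEFAULT] section with username and password keys is exactly what the code above checks for. A minimal sketch, with placeholder values:

    [DEFAULT]
    username = you@example.com
    password = your-infovaya-password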
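
Typical invocations, assuming the script is saved under a hypothetical name like fetch_icra_pdfs.py (the event ID defaults to 88, ICRA 2022, and the session ID passed to --start-with is a placeholder):

    python fetch_icra_pdfs.py
    python fetch_icra_pdfs.py --save-pages
    python fetch_icra_pdfs.py --start-with 12345 --no-cache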