A script for grabbing the presentation PDFs from ICRA 2022.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 4.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import requests, time, re, os, configparser, sys, argparse, shutil
  2. from bs4 import BeautifulSoup
  3. def main(args: dict) -> None:
  4. print(f"Fetching PDFs.")
  5. if not os.path.isdir("pdfs"):
  6. print("Setting up PDFs directory.")
  7. os.mkdir("pdfs")
  8. if args.save_pages and not os.path.isdir("html"):
  9. print("Setting up HTML directory.")
  10. os.mkdir("html")
  11. config = configparser.ConfigParser(interpolation=None)
  12. if not os.path.isfile('config.ini'):
  13. print("config.ini is missing!")
  14. sys.exit(0)
  15. config.read('config.ini')
  16. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  17. print("username and password must be present in config.ini.")
  18. sys.exit(0)
  19. INFOVAYA_ROOT = "https://events.infovaya.com/"
  20. info = { "username": config["DEFAULT"]["username"],
  21. "password": config["DEFAULT"]["password"],
  22. "action": "login"
  23. }
  24. infovaya_session = requests.Session()
  25. infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
  26. infovaya_creds_request.raise_for_status()
  27. root_event = 88 if not args.event else args.event
  28. sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
  29. sessions_request.raise_for_status()
  30. sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
  31. if args.save_pages:
  32. with open("html/main.html", "w+", encoding="utf-8") as file:
  33. file.write(sessions_page_soup.prettify())
  34. print("Starting to search for PDFs. This may take a while...")
  35. for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
  36. session_href = session_link["href"][1:]
  37. page_name = f"session_{session_href.split('=')[-1]}.html"
  38. if not os.path.isfile(f"html/{page_name}"):
  39. time.sleep(1)
  40. print(f"Fetching {session_href} ... ", end="")
  41. page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
  42. page_request.raise_for_status()
  43. print("Success! Finding presentations... ", end="")
  44. page_soup = BeautifulSoup(page_request.text, "html.parser")
  45. if args.save_pages:
  46. with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
  47. file.write(page_soup.prettify())
  48. print(f"Wrote to file html/{page_name}.")
  49. else:
  50. print()
  51. # import pdb; pdb.set_trace()
  52. for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
  53. presentation_href = presentation_link["href"][1:]
  54. pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
  55. if not os.path.isfile(f"html/{pres_page_name}"):
  56. print(f" Fetching {presentation_href} ...", end="")
  57. pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
  58. pres_request.raise_for_status()
  59. print("Success! ", end="")
  60. pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
  61. if args.save_pages:
  62. with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
  63. file.write(pres_page_soup.prettify())
  64. pres_page_title = pres_page_soup.h1.string
  65. print(f"Found '{pres_page_title}'. Finding PDF...")
  66. for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
  67. pdf_name = pdf_tag["href"].split("/")[-1]
  68. file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
  69. pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
  70. pdf_request.raise_for_status()
  71. print(f"Fetching PDF for '{pres_page_title}'...")
  72. with open(f"pdfs/{file_name}", "wb") as file:
  73. file.write(pdf_request.content)
  74. print(f"Saved as pdfs/{file_name}")
  75. time.sleep(2)
  76. if __name__ == "__main__":
  77. parser = argparse.ArgumentParser()
  78. parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
  79. parser.add_argument("-e", "--event", action="store_true", help="Choose which event ID to scan")
  80. args = parser.parse_args()
  81. print(args)
  82. main(args)