Script for grabbing the presentation PDFs from the ICRA 2022 proceedings hosted on Infovaya.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
hace 2 años
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import requests, time, re, os, configparser, sys, argparse, shutil
  2. from bs4 import BeautifulSoup
  3. def main(args: dict) -> None:
  4. print(f"Fetching PDFs.")
  5. if not os.path.isdir("pdfs"):
  6. print("Setting up PDFs directory.")
  7. os.mkdir("pdfs")
  8. if args.save_pages and not os.path.isdir("html"):
  9. print("Setting up HTML directory.")
  10. os.mkdir("html")
  11. config = configparser.ConfigParser(interpolation=None)
  12. if not os.path.isfile('config.ini'):
  13. print("config.ini is missing!")
  14. sys.exit(0)
  15. config.read('config.ini')
  16. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  17. print("username and password must be present in config.ini.")
  18. sys.exit(0)
  19. INFOVAYA_ROOT = "https://events.infovaya.com/"
  20. info = { "username": config["DEFAULT"]["username"],
  21. "password": config["DEFAULT"]["password"],
  22. "action": "login"
  23. }
  24. infovaya_session = requests.Session()
  25. infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
  26. infovaya_creds_request.raise_for_status()
  27. root_event = 88 if not args.event else args.event
  28. sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id={root_event}&actionMenu=sessions")
  29. sessions_request.raise_for_status()
  30. sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
  31. if args.save_pages:
  32. with open("html/main.html", "w+", encoding="utf-8") as file:
  33. file.write(sessions_page_soup.prettify())
  34. print("Starting to search for PDFs. This may take a while...")
  35. for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
  36. session_href = session_link["href"][1:]
  37. page_name = f"session_{session_href.split('=')[-1]}.html"
  38. if not os.path.isfile(f"html/{page_name}"):
  39. time.sleep(1)
  40. print(f"Fetching {session_href} ... ", end="")
  41. page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
  42. page_request.raise_for_status()
  43. print("Success! Finding presentations... ", end="")
  44. page_soup = BeautifulSoup(page_request.text, "html.parser")
  45. if args.save_pages:
  46. with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
  47. file.write(page_soup.prettify())
  48. print(f"Wrote to file html/{page_name}.")
  49. else:
  50. print()
  51. # import pdb; pdb.set_trace()
  52. for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
  53. presentation_href = presentation_link["href"][1:]
  54. pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
  55. if not os.path.isfile(f"html/{pres_page_name}"):
  56. print(f" Fetching {presentation_href} ...", end="")
  57. pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
  58. pres_request.raise_for_status()
  59. print("Success! ", end="")
  60. pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
  61. if args.save_pages:
  62. with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
  63. file.write(pres_page_soup.prettify())
  64. pres_page_title = pres_page_soup.h1.string
  65. print(f"Found '{pres_page_title}'. Finding PDF...")
  66. for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
  67. pdf_name = pdf_tag["href"].split("/")[-1]
  68. file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
  69. pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
  70. pdf_request.raise_for_status()
  71. print(f"Fetching PDF for '{pres_page_title}'...")
  72. with open(f"pdfs/{file_name}", "wb") as file:
  73. file.write(pdf_request.content)
  74. print(f"Saved as pdfs/{file_name}")
  75. time.sleep(2)
  76. if __name__ == "__main__":
  77. parser = argparse.ArgumentParser()
  78. parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of the scanned pages")
  79. parser.add_argument("-e", "--event", action="store_true", help="Choose which event ID to scan")
  80. args = parser.parse_args()
  81. print(args)
  82. main(args)