A script for grabbing paper PDFs from the ICRA 2022 event hosted on events.infovaya.com.

import argparse
import configparser
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup


def main(args: argparse.Namespace) -> None:
    print("Fetching PDFs.")
    if not os.path.isdir("pdfs"):
        print("Setting up PDFs directory.")
        os.mkdir("pdfs")
    if args.save_pages and not os.path.isdir("html"):
        print("Setting up HTML directory.")
        os.mkdir("html")
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile("config.ini"):
        print("config.ini is missing!")
        sys.exit(1)
    config.read("config.ini")
    if not config["DEFAULT"].get("username") or not config["DEFAULT"].get("password"):
        print("username and password must be present in config.ini.")
        sys.exit(1)
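    # Expected config.ini layout: the key names come from the checks above,
    # and the values shown here are placeholders.
    #
    #   [DEFAULT]
    #   username = you@example.com
    #   password = your-infovaya-password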
    INFOVAYA_ROOT = "https://events.infovaya.com/"
    info = {
        "username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login",
    }
    # Log in once; the session cookie then authenticates every later request.
    infovaya_session = requests.Session()
    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
    infovaya_creds_request.raise_for_status()
    # Event id 88 is ICRA 2022 on Infovaya.
    sessions_request = infovaya_session.get(f"{INFOVAYA_ROOT}event?id=88&actionMenu=sessions")
    sessions_request.raise_for_status()
    sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
    if args.save_pages:
        with open("html/main.html", "w+", encoding="utf-8") as file:
            file.write(sessions_page_soup.prettify())
    print("Starting to search for PDFs. This may take a while...")
    # Every session page is linked from the main sessions listing.
    for session_link in sessions_page_soup.find_all("a", href=re.compile(r"session\?id")):
        session_href = session_link["href"][1:]
        page_name = f"session_{session_href.split('=')[-1]}.html"
        # Skip sessions already saved to html/ by an earlier run.
        if not os.path.isfile(f"html/{page_name}"):
            time.sleep(1)
            print(f"Fetching {session_href} ... ", end="")
            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
            page_request.raise_for_status()
            print("Success! Finding presentations... ", end="")
            page_soup = BeautifulSoup(page_request.text, "html.parser")
            if args.save_pages:
                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
                    file.write(page_soup.prettify())
                print(f"Wrote to file html/{page_name}.")
            else:
                print()
            for presentation_link in page_soup.find_all("a", href=re.compile(r"presentation\?id")):
                presentation_href = presentation_link["href"][1:]
                pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
                if not os.path.isfile(f"html/{pres_page_name}"):
                    print(f" Fetching {presentation_href} ...", end="")
                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
                    pres_request.raise_for_status()
                    print("Success! ", end="")
                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
                    if args.save_pages:
                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
                            file.write(pres_page_soup.prettify())
                    pres_page_title = pres_page_soup.h1.string
                    print(f"Found '{pres_page_title}'. Finding PDF...")
                    # PDF links go through Infovaya's pdfviewer endpoint.
                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
                        pdf_name = pdf_tag["href"].split("/")[-1]
                        file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
                        pdf_request.raise_for_status()
                        print(f"Fetching PDF for '{pres_page_title}'...")
                        with open(f"pdfs/{file_name}", "wb") as file:
                            file.write(pdf_request.content)
                        print(f"Saved as pdfs/{file_name}")
                        # Be polite to the server between downloads.
                        time.sleep(2)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--save-pages", action="store_true",
                        help="Save the HTML of fetched session and presentation pages.")
    # parser.add_argument("-i", "--images", action="store_true",
    #                     help="Set this flag to download images as well as HTML.\n"
    #                          "NOTE: This may be VERY bandwidth and disk intensive!")
    args = parser.parse_args()
    print(args)
    main(args)
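Usage: install the two third-party dependencies, put your Infovaya credentials in a config.ini next to the script, and run it. The script filename below is a placeholder for whatever name the file is saved under:

    pip install requests beautifulsoup4
    python icra2022_pdfs.py --save-pages

One caveat: pres_page_title can contain characters such as "/" that are not valid in file paths, so saving can fail on some titles. A minimal sanitizer sketch that could be applied when building file_name (a hypothetical helper, not part of the original script):

    import re

    def safe_filename(name: str) -> str:
        # Collapse anything outside a conservative character whitelist to "_".
        return re.sub(r"[^A-Za-z0-9._-]+", "_", name)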