For grabbing PDFs from ICRA 2022
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

main.py 2.5KB

2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
2 yıl önce
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import requests, time, re, os, configparser, sys, argparse, shutil
  2. from bs4 import BeautifulSoup
  3. def main(args):
  4. print(f"Fetching PDFs.")
  5. if not os.path.isdir("pdfs"):
  6. print("First-time setup...")
  7. os.mkdir("pdfs")
  8. config = configparser.ConfigParser(interpolation=None)
  9. if not os.path.isfile('config.ini'):
  10. print("config.ini is missing!")
  11. sys.exit(0)
  12. config.read('config.ini')
  13. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  14. print("username and password must be present in config.ini.")
  15. sys.exit(0)
  16. INFOVAYA_ROOT = "https://events.infovaya.com/"
  17. info = { "username": config["DEFAULT"]["username"],
  18. "password": config["DEFAULT"]["password"],
  19. "action": "login"
  20. }
  21. s = requests.Session()
  22. q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
  23. q.raise_for_status()
  24. sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
  25. sessions_request.raise_for_status()
  26. sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
  27. print("Starting to search for PDFs. This may take a while...")
  28. for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
  29. time.sleep(0.05)
  30. page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
  31. page_request.raise_for_status()
  32. page_soup = BeautifulSoup(page_request.text, "html.parser")
  33. page_title = page.soup.h1.string
  34. for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
  35. pdf_name = pdf_tag["href"].split("/")[-1]
  36. file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
  37. pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
  38. pdf_request.raise_for_status()
  39. print("Fetching PDF for '{}'...".format(page_title))
  40. with open(file_name, "wb") as file:
  41. pdf_request.raw.decode_content = True
  42. shutil.copyfileobj(pdf_request.raw, file)
  43. print("Saved as pdfs/{}".format(file_name))
  44. print("Finished fetching thread.")
  45. if __name__ == "__main__":
  46. parser = argparse.ArgumentParser()
  47. parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
  48. parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
  49. args = parser.parse_args()
  50. main(args)