For grabbing PDFs from ICRA 2022
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 2.5KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import requests, time, re, os, configparser, sys, argparse, shutil
  2. from bs4 import BeautifulSoup
  3. def main(args):
  4. print(f"Fetching PDFs.")
  5. if not os.path.isdir("pdfs"):
  6. print("First-time setup...")
  7. os.mkdir("pdfs")
  8. config = configparser.ConfigParser(interpolation=None)
  9. if not os.path.isfile('config.ini'):
  10. print("config.ini is missing!")
  11. sys.exit(0)
  12. config.read('config.ini')
  13. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  14. print("username and password must be present in config.ini.")
  15. sys.exit(0)
  16. INFOVAYA_ROOT = "https://events.infovaya.com/"
  17. info = { "username": config["DEFAULT"]["username"],
  18. "password": config["DEFAULT"]["password"],
  19. "action": "login"
  20. }
  21. s = requests.Session()
  22. q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
  23. q.raise_for_status()
  24. sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
  25. sessions_request.raise_for_status()
  26. sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
  27. print("Starting to search for PDFs. This may take a while...")
  28. for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
  29. time.sleep(0.05)
  30. page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
  31. page_request.raise_for_status()
  32. page_soup = BeautifulSoup(page_request.text, "html.parser")
  33. page_title = page.soup.h1.string
  34. for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
  35. pdf_name = pdf_tag["href"].split("/")[-1]
  36. file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
  37. pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
  38. pdf_request.raise_for_status()
  39. print("Fetching PDF for '{}'...".format(page_title))
  40. with open(file_name, "wb") as file:
  41. pdf_request.raw.decode_content = True
  42. shutil.copyfileobj(pdf_request.raw, file)
  43. print("Saved as pdfs/{}".format(file_name))
  44. print("Finished fetching thread.")
  45. if __name__ == "__main__":
  46. parser = argparse.ArgumentParser()
  47. parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
  48. parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
  49. args = parser.parse_args()
  50. main(args)