For grabbing PDFs from ICRA 2022
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

  1. import requests, time, re, os, configparser, sys, argparse
  2. from bs4 import BeautifulSoup
  3. from PIL import Image
  4. from io import BytesIO
  5. css_to_change = ["",
  6. "",
  7. "",
  8. "",
  9. ""
  10. ]
  11. css_to_change_to = ["main.css",
  12. "bbcode.css",
  13. "jquery-ui.min.css",
  14. "globalmenu.css",
  15. "forums.css"
  16. ]
  17. scripts_to_change = ["",
  18. "",
  19. "",
  20. "",
  21. "",
  22. ]
  23. scripts_to_change_to = ["jquery.min.js",
  24. "jquery-migrate.min.js",
  25. "jquery-ui.min.js",
  26. "forums-combined.js",
  27. "twemoji.min.js"
  28. ]
  29. def main(args):
  30. print(f"Fetching from thread {args.thread}.")
  31. if not os.path.isdir("archive"):
  32. print("First-time setup...")
  33. os.mkdir("archive")
  34. if not os.path.isdir("archive/css"):
  35. print("Setting up CSS...")
  36. os.mkdir("archive/css")
  37. for f in range(len(css_to_change)):
  38. r = requests.get(css_to_change[f])
  39. with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
  40. file.write(r.text)
  41. if not os.path.isdir("archive/scripts"):
  42. print("Setting up scripts...")
  43. os.mkdir("archive/scripts")
  44. for f in range(len(scripts_to_change)):
  45. r = requests.get(scripts_to_change[f])
  46. with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
  47. file.write(r.text)
  48. if not os.path.isdir(f"archive/{args.thread}"):
  49. print(f"Creating directory for {args.thread}...")
  50. os.mkdir(f"archive/{args.thread}")
  51. if not os.path.isdir(f"archive/{args.thread}/images"):
  52. print(f"Creating directory for {args.thread}/images...")
  53. os.mkdir(f"archive/{args.thread}/images")
  54. config = configparser.ConfigParser(interpolation=None)
  55. if not os.path.isfile('config.ini'):
  56. print("config.ini is missing!")
  57. sys.exit(0)
  59. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  60. print("username and password must be present in config.ini.")
  61. sys.exit(0)
  62. info = { "username": config["DEFAULT"]["username"],
  63. "password": config["DEFAULT"]["password"],
  64. "action": "login"
  65. }
  66. s = requests.Session()
  67. q ="", data=info)
  68. if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
  69. lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
  70. else:
  71. lastpage = 1
  72. i = lastpage
  73. parse_ok = True
  74. while True:
  75. time.sleep(0.05)
  76. payload = {'threadid': args.thread, 'pagenumber': str(i)}
  77. r = s.get("", params=payload)
  78. if "Specified thread was not found in the live forums." in r.text:
  79. print("That thread does not exist or is not accessible to you.")
  80. parse_ok = False
  81. break
  82. if "The page number you requested" in r.text:
  83. i -= 1
  84. break
  85. print(f"Fetching page {i} in thread {args.thread}.")
  86. with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
  87. soup = BeautifulSoup(r.text, "html.parser")
  88. for tag in soup.find_all("link",{"href":True}):
  89. if tag["href"] in css_to_change:
  90. tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
  91. for tag in soup.find_all("script",{"src":True}):
  92. if tag["src"] in scripts_to_change:
  93. tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
  94. for tag in soup.find_all("a",{"title":True}):
  95. if tag["title"] == "Next page":
  96. tag["href"] = f"page{i+1}.html"
  97. if tag["title"] == "Previous page":
  98. tag["href"] = f"page{i-1}.html"
  99. if args.images:
  100. for tag in soup.find_all("img",{"src":True}):
  101. src = tag["src"]
  102. if src[:4] != "http":
  103. src = "https:" + src
  104. imgname = src.split("/")[-1]
  105. fullpath = f"archive/{args.thread}/images/{imgname}"
  106. if os.path.isfile(fullpath):
  107. tag["src"] = f"images/{imgname}"
  108. else:
  109. img = s.get(src, stream=True)
  110. if img.status_code == 200:
  111. try:
  112. theimage =
  113. print(f"\tSaving {fullpath}.")
  115. tag["src"] = f"images/{imgname}"
  116. except:
  117. print(f"\tImage {src} not available.")
  118. else:
  119. print(f"\tImage {src} not available.")
  120. file.write(soup.prettify())
  121. i += 1
  122. print("Finished fetching thread.")
  123. config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
  124. with open("config.ini", "w") as file:
  125. config.write(file)
  126. if __name__ == "__main__":
  127. parser = argparse.ArgumentParser()
  128. parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
  129. parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
  130. args = parser.parse_args()
  131. main(args)