For grabbing PDFs from ICRA 2022
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 5.9KB

il y a 2 ans
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
import argparse
import configparser
import os
import re
import sys
import time
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image
  5. css_to_change = ["https://www.somethingawful.com/css/main.css?12",
  6. "https://forums.somethingawful.com/css/bbcode.css?1456974408",
  7. "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
  8. "https://www.somethingawful.com/css/globalmenu.css",
  9. "https://www.somethingawful.com/css/forums.css?1545838155"
  10. ]
  11. css_to_change_to = ["main.css",
  12. "bbcode.css",
  13. "jquery-ui.min.css",
  14. "globalmenu.css",
  15. "forums.css"
  16. ]
  17. scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
  18. "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
  19. "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
  20. "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
  21. "https://twemoji.maxcdn.com/2/twemoji.min.js",
  22. ]
  23. scripts_to_change_to = ["jquery.min.js",
  24. "jquery-migrate.min.js",
  25. "jquery-ui.min.js",
  26. "forums-combined.js",
  27. "twemoji.min.js"
  28. ]
  29. def main(args):
  30. print(f"Fetching from thread {args.thread}.")
  31. if not os.path.isdir("archive"):
  32. print("First-time setup...")
  33. os.mkdir("archive")
  34. if not os.path.isdir("archive/css"):
  35. print("Setting up CSS...")
  36. os.mkdir("archive/css")
  37. for f in range(len(css_to_change)):
  38. r = requests.get(css_to_change[f])
  39. with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
  40. file.write(r.text)
  41. if not os.path.isdir("archive/scripts"):
  42. print("Setting up scripts...")
  43. os.mkdir("archive/scripts")
  44. for f in range(len(scripts_to_change)):
  45. r = requests.get(scripts_to_change[f])
  46. with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
  47. file.write(r.text)
  48. if not os.path.isdir(f"archive/{args.thread}"):
  49. print(f"Creating directory for {args.thread}...")
  50. os.mkdir(f"archive/{args.thread}")
  51. if not os.path.isdir(f"archive/{args.thread}/images"):
  52. print(f"Creating directory for {args.thread}/images...")
  53. os.mkdir(f"archive/{args.thread}/images")
  54. config = configparser.ConfigParser(interpolation=None)
  55. if not os.path.isfile('config.ini'):
  56. print("config.ini is missing!")
  57. sys.exit(0)
  58. config.read('config.ini')
  59. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  60. print("username and password must be present in config.ini.")
  61. sys.exit(0)
  62. info = { "username": config["DEFAULT"]["username"],
  63. "password": config["DEFAULT"]["password"],
  64. "action": "login"
  65. }
  66. s = requests.Session()
  67. q = s.post("https://forums.somethingawful.com/account.php", data=info)
  68. if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
  69. lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
  70. else:
  71. lastpage = 1
  72. i = lastpage
  73. parse_ok = True
  74. while True:
  75. time.sleep(0.05)
  76. payload = {'threadid': args.thread, 'pagenumber': str(i)}
  77. r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
  78. if "Specified thread was not found in the live forums." in r.text:
  79. print("That thread does not exist or is not accessible to you.")
  80. parse_ok = False
  81. break
  82. if "The page number you requested" in r.text:
  83. i -= 1
  84. break
  85. print(f"Fetching page {i} in thread {args.thread}.")
  86. with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
  87. soup = BeautifulSoup(r.text, "html.parser")
  88. for tag in soup.find_all("link",{"href":True}):
  89. if tag["href"] in css_to_change:
  90. tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
  91. for tag in soup.find_all("script",{"src":True}):
  92. if tag["src"] in scripts_to_change:
  93. tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
  94. for tag in soup.find_all("a",{"title":True}):
  95. if tag["title"] == "Next page":
  96. tag["href"] = f"page{i+1}.html"
  97. if tag["title"] == "Previous page":
  98. tag["href"] = f"page{i-1}.html"
  99. if args.images:
  100. for tag in soup.find_all("img",{"src":True}):
  101. src = tag["src"]
  102. if src[:4] != "http":
  103. src = "https:" + src
  104. imgname = src.split("/")[-1]
  105. fullpath = f"archive/{args.thread}/images/{imgname}"
  106. if os.path.isfile(fullpath):
  107. tag["src"] = f"images/{imgname}"
  108. else:
  109. img = s.get(src, stream=True)
  110. if img.status_code == 200:
  111. try:
  112. theimage = Image.open(BytesIO(img.content))
  113. print(f"\tSaving {fullpath}.")
  114. theimage.save(fullpath)
  115. tag["src"] = f"images/{imgname}"
  116. except:
  117. print(f"\tImage {src} not available.")
  118. else:
  119. print(f"\tImage {src} not available.")
  120. file.write(soup.prettify())
  121. i += 1
  122. print("Finished fetching thread.")
  123. config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
  124. with open("config.ini", "w") as file:
  125. config.write(file)
  126. if __name__ == "__main__":
  127. parser = argparse.ArgumentParser()
  128. parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
  129. parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
  130. args = parser.parse_args()
  131. main(args)