For archiving threads from the Something Awful forums as local HTML (optionally with images).

main.py 5.9KB

import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Remote stylesheets referenced by forum pages, and the local filenames
# they are rewritten to so archived pages render offline.
css_to_change = ["https://www.somethingawful.com/css/main.css?12",
                 "https://forums.somethingawful.com/css/bbcode.css?1456974408",
                 "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
                 "https://www.somethingawful.com/css/globalmenu.css",
                 "https://www.somethingawful.com/css/forums.css?1545838155"
                 ]
css_to_change_to = ["main.css",
                    "bbcode.css",
                    "jquery-ui.min.css",
                    "globalmenu.css",
                    "forums.css"
                    ]
# Same mapping for the scripts the pages load.
scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
                     "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
                     "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
                     "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
                     "https://twemoji.maxcdn.com/2/twemoji.min.js",
                     ]
scripts_to_change_to = ["jquery.min.js",
                        "jquery-migrate.min.js",
                        "jquery-ui.min.js",
                        "forums-combined.js",
                        "twemoji.min.js"
                        ]


def main(args):
    print(f"Fetching from thread {args.thread}.")
    # First-time setup: create the archive tree and download local copies
    # of the CSS and scripts that the saved pages will reference.
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        os.mkdir("archive/css")
        for f in range(len(css_to_change)):
            r = requests.get(css_to_change[f])
            with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        os.mkdir("archive/scripts")
        for f in range(len(scripts_to_change)):
            r = requests.get(scripts_to_change[f])
            with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir(f"archive/{args.thread}"):
        print(f"Creating directory for {args.thread}...")
        os.mkdir(f"archive/{args.thread}")
    if not os.path.isdir(f"archive/{args.thread}/images"):
        print(f"Creating directory for {args.thread}/images...")
        os.mkdir(f"archive/{args.thread}/images")

    # Read forum credentials from config.ini.
    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        sys.exit(1)
    config.read('config.ini')
    if ("username" not in config["DEFAULT"] or "password" not in config["DEFAULT"]
            or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == ""):
        print("username and password must be present in config.ini.")
        sys.exit(1)
    info = {"username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login"
            }

    # Log in so the session cookie is available for the rest of the requests.
    s = requests.Session()
    s.post("https://forums.somethingawful.com/account.php", data=info)

    # Resume from the last page recorded for this thread, if any.
    if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
        lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
    else:
        lastpage = 1

    i = lastpage
    parse_ok = True
    while True:
        time.sleep(0.05)
        payload = {'threadid': args.thread, 'pagenumber': str(i)}
        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
        if "Specified thread was not found in the live forums." in r.text:
            print("That thread does not exist or is not accessible to you.")
            parse_ok = False
            break
        if "The page number you requested" in r.text:
            # Walked past the last page of the thread; back up and stop.
            i -= 1
            break
        print(f"Fetching page {i} in thread {args.thread}.")
        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
            soup = BeautifulSoup(r.text, "html.parser")
            # Point stylesheet and script tags at the local copies.
            for tag in soup.find_all("link", {"href": True}):
                if tag["href"] in css_to_change:
                    tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
            for tag in soup.find_all("script", {"src": True}):
                if tag["src"] in scripts_to_change:
                    tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
            # Rewrite the pagination links so they work offline.
            for tag in soup.find_all("a", {"title": True}):
                if tag["title"] == "Next page":
                    tag["href"] = f"page{i+1}.html"
                if tag["title"] == "Previous page":
                    tag["href"] = f"page{i-1}.html"
            if args.images:
                # Download each image once and rewrite the tag to the local copy.
                for tag in soup.find_all("img", {"src": True}):
                    src = tag["src"]
                    if src[:4] != "http":
                        src = "https:" + src
                    imgname = src.split("/")[-1]
                    fullpath = f"archive/{args.thread}/images/{imgname}"
                    if os.path.isfile(fullpath):
                        tag["src"] = f"images/{imgname}"
                    else:
                        img = s.get(src, stream=True)
                        if img.status_code == 200:
                            try:
                                theimage = Image.open(BytesIO(img.content))
                                print(f"\tSaving {fullpath}.")
                                theimage.save(fullpath)
                                tag["src"] = f"images/{imgname}"
                            except Exception:
                                print(f"\tImage {src} not available.")
                        else:
                            print(f"\tImage {src} not available.")
            file.write(soup.prettify())
        i += 1

    # Only record progress when the thread was actually readable
    # (parse_ok is cleared above if the thread lookup failed).
    if parse_ok:
        print("Finished fetching thread.")
        config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
        with open("config.ini", "w") as file:
            config.write(file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
    parser.add_argument("-i", "--images", action="store_true",
                        help="Set this flag to download images as well as HTML.\n"
                             "NOTE: This may be VERY bandwidth and disk intensive!")
    args = parser.parse_args()
    main(args)
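
For reference, a minimal sketch of the config.ini the script reads and a sample invocation; the username, password, and thread ID below are placeholders, and the lastpage&lt;threadid&gt; key is written back by the script itself to record where to resume:

    [DEFAULT]
    username = your_forums_username
    password = your_forums_password

    # threadid comes from the thread's URL; add -i/--images to also mirror images
    python main.py 1234567 --images

Pages are written to archive/&lt;threadid&gt;/pageN.html, with shared CSS and scripts under archive/css and archive/scripts.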