For scraping award-nomination posts from a Something Awful forums thread (note: despite any earlier description, this script does not fetch ICRA PDFs)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper.py 2.3KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import requests, time, re, os, configparser, sys
  2. from bs4 import BeautifulSoup
  3. config = configparser.ConfigParser()
  4. if not os.path.isfile('config.ini'):
  5. print("config.ini is missing!")
  6. sys.exit(0)
  7. config.read('config.ini')
  8. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  9. print("username and password must be present in config.ini.")
  10. sys.exit(0)
  11. info = { "username": config["DEFAULT"]["username"],
  12. "password": config["DEFAULT"]["password"],
  13. "action": "login"
  14. }
  15. # files = [f for f in os.listdir('pages') if os.path.isfile("./pages/{}".format(f))]
  16. if not os.path.isdir('pages'):
  17. os.mkdir('pages', 0o755)
  18. s = requests.Session()
  19. q = s.post("https://forums.somethingawful.com/account.php", data=info)
  20. # print(q.text)
  21. if "lastpage" in config["DEFAULT"] and config["DEFAULT"]["lastpage"] != "":
  22. lastpage = int(config["DEFAULT"]["lastpage"])
  23. else:
  24. lastpage = 1
  25. i = lastpage
  26. while True:
  27. time.sleep(0.1)
  28. payload = {'threadid': '3908778', 'pagenumber': str(i)}
  29. r = s.get("https://forums.somethingawful.com/showthread.php", params=payload) #, cookies=jar)
  30. # with open("pages/rawpage{}.txt".format(i), "w+") as file:
  31. # file.write(r.text)
  32. if "The page number you requested" in r.text:
  33. i -= 1
  34. break
  35. matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE|re.MULTILINE)
  36. # matcher = re.compile(r'[Gg]aybie[s]? [Nn]om')
  37. if re.search(matcher, r.text) != None:
  38. print("Page {} has a nomination.".format(i))
  39. soup = BeautifulSoup(r.text, 'html.parser')
  40. for tag in soup.find_all('tr'):
  41. keep = False
  42. latestimg = ""
  43. for child in tag.descendants:
  44. #if child.name == "img":
  45. # lastimg = child['src']
  46. res = re.search(matcher, str(child))
  47. if res != None:
  48. # out = "{}: {}".format(res.group(1), lastimg)
  49. # print(out)
  50. # with open("nominations.txt", "a") as file:
  51. # file.write(out + "\n")
  52. keep = True
  53. if keep == False:
  54. tag.decompose()
  55. with open("pages/page{}.html".format(i), "w") as file:
  56. file.write(str(soup))
  57. else:
  58. print("Page {} has no nominations.".format(i))
  59. i += 1
  60. config["DEFAULT"]["lastpage"] = str(i)
  61. with open("config.ini", "w") as file:
  62. config.write(file)