For grabbing PDFs from ICRA 2022
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import requests, time, re, os, configparser, sys
  2. from bs4 import BeautifulSoup
  3. config = configparser.ConfigParser()
  4. if not os.path.isfile('config.ini'):
  5. print("config.ini is missing!")
  6. sys.exit(0)
  7. config.read('config.ini')
  8. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  9. print("username and password must be present in config.ini.")
  10. sys.exit(0)
  11. info = { "username": config["DEFAULT"]["username"],
  12. "password": config["DEFAULT"]["password"],
  13. "action": "login"
  14. }
  15. # files = [f for f in os.listdir('pages') if os.path.isfile("./pages/{}".format(f))]
  16. if not os.path.isdir('pages'):
  17. os.mkdir('pages', 0o755)
  18. s = requests.Session()
  19. q = s.post("https://forums.somethingawful.com/account.php", data=info)
  20. # print(q.text)
  21. if "lastpage" in config["DEFAULT"] and config["DEFAULT"]["lastpage"] != "":
  22. lastpage = int(config["DEFAULT"]["lastpage"])
  23. else:
  24. lastpage = 1
  25. i = lastpage
  26. while True:
  27. time.sleep(0.1)
  28. payload = {'threadid': '3908778', 'pagenumber': str(i)}
  29. r = s.get("https://forums.somethingawful.com/showthread.php", params=payload) #, cookies=jar)
  30. # with open("pages/rawpage{}.txt".format(i), "w+") as file:
  31. # file.write(r.text)
  32. if "The page number you requested" in r.text:
  33. i -= 1
  34. break
  35. matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE|re.MULTILINE)
  36. # matcher = re.compile(r'[Gg]aybie[s]? [Nn]om')
  37. if re.search(matcher, r.text) != None:
  38. print("Page {} has a nomination.".format(i))
  39. soup = BeautifulSoup(r.text, 'html.parser')
  40. for tag in soup.find_all('tr'):
  41. keep = False
  42. latestimg = ""
  43. for child in tag.descendants:
  44. #if child.name == "img":
  45. # lastimg = child['src']
  46. res = re.search(matcher, str(child))
  47. if res != None:
  48. # out = "{}: {}".format(res.group(1), lastimg)
  49. # print(out)
  50. # with open("nominations.txt", "a") as file:
  51. # file.write(out + "\n")
  52. keep = True
  53. if keep == False:
  54. tag.decompose()
  55. with open("pages/page{}.html".format(i), "w") as file:
  56. file.write(str(soup))
  57. else:
  58. print("Page {} has no nominations.".format(i))
  59. i += 1
  60. config["DEFAULT"]["lastpage"] = str(i)
  61. with open("config.ini", "w") as file:
  62. config.write(file)