For scraping award-nomination posts from a Something Awful forums thread (note: despite any earlier description, this script does not fetch ICRA PDFs)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper.py 2.3KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import requests, time, re, os, configparser, sys
  2. from bs4 import BeautifulSoup
  3. config = configparser.ConfigParser()
  4. if not os.path.isfile('config.ini'):
  5. print("config.ini is missing!")
  6. sys.exit(0)
  7. config.read('config.ini')
  8. if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
  9. print("username and password must be present in config.ini.")
  10. sys.exit(0)
  11. info = { "username": config["DEFAULT"]["username"],
  12. "password": config["DEFAULT"]["password"],
  13. "action": "login"
  14. }
  15. # files = [f for f in os.listdir('pages') if os.path.isfile("./pages/{}".format(f))]
  16. if not os.path.isdir('pages'):
  17. os.mkdir('pages', 0o755)
  18. s = requests.Session()
  19. q = s.post("https://forums.somethingawful.com/account.php", data=info)
  20. # print(q.text)
  21. if "lastpage" in config["DEFAULT"] and config["DEFAULT"]["lastpage"] != "":
  22. lastpage = int(config["DEFAULT"]["lastpage"])
  23. else:
  24. lastpage = 1
  25. i = lastpage
  26. while True:
  27. time.sleep(0.1)
  28. payload = {'threadid': '3908778', 'pagenumber': str(i)}
  29. r = s.get("https://forums.somethingawful.com/showthread.php", params=payload) #, cookies=jar)
  30. # with open("pages/rawpage{}.txt".format(i), "w+") as file:
  31. # file.write(r.text)
  32. if "The page number you requested" in r.text:
  33. i -= 1
  34. break
  35. matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE|re.MULTILINE)
  36. # matcher = re.compile(r'[Gg]aybie[s]? [Nn]om')
  37. if re.search(matcher, r.text) != None:
  38. print("Page {} has a nomination.".format(i))
  39. soup = BeautifulSoup(r.text, 'html.parser')
  40. for tag in soup.find_all('tr'):
  41. keep = False
  42. latestimg = ""
  43. for child in tag.descendants:
  44. #if child.name == "img":
  45. # lastimg = child['src']
  46. res = re.search(matcher, str(child))
  47. if res != None:
  48. # out = "{}: {}".format(res.group(1), lastimg)
  49. # print(out)
  50. # with open("nominations.txt", "a") as file:
  51. # file.write(out + "\n")
  52. keep = True
  53. if keep == False:
  54. tag.decompose()
  55. with open("pages/page{}.html".format(i), "w") as file:
  56. file.write(str(soup))
  57. else:
  58. print("Page {} has no nominations.".format(i))
  59. i += 1
  60. config["DEFAULT"]["lastpage"] = str(i)
  61. with open("config.ini", "w") as file:
  62. config.write(file)