Initialize

Commit ade86eca55 to master by Noëlle, 1 year ago
4 changed files with 234 additions and 0 deletions
  1. config.ini        +3    -0
  2. main.py           +144  -0
  3. requirements.txt  +16   -0
  4. scraper.py        +71   -0

config.ini  (+3 -0)

@@ -0,0 +1,3 @@
[DEFAULT]
username =
password =
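
For reference, a filled-in config.ini might look like the sketch below. The credentials are placeholders, and the lastpage line is hypothetical: main.py writes a lastpage<threadid> key per archived thread (and scraper.py a plain lastpage key) so that later runs can resume where they left off.

[DEFAULT]
username = your_forum_username
password = your_forum_password
lastpage3908778 = 42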

main.py  (+144 -0)

@@ -0,0 +1,144 @@
import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Remote stylesheets used by the forum, and the local filenames they are saved under.
css_to_change = ["https://www.somethingawful.com/css/main.css?12",
                 "https://forums.somethingawful.com/css/bbcode.css?1456974408",
                 "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
                 "https://www.somethingawful.com/css/globalmenu.css",
                 "https://www.somethingawful.com/css/forums.css?1545838155"
                 ]
css_to_change_to = ["main.css",
                    "bbcode.css",
                    "jquery-ui.min.css",
                    "globalmenu.css",
                    "forums.css"
                    ]

# Remote scripts used by the forum, and the local filenames they are saved under.
scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
                     "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
                     "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
                     "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
                     "https://twemoji.maxcdn.com/2/twemoji.min.js",
                     ]
scripts_to_change_to = ["jquery.min.js",
                        "jquery-migrate.min.js",
                        "jquery-ui.min.js",
                        "forums-combined.js",
                        "twemoji.min.js"
                        ]


def main(args):
    print(f"Fetching from thread {args.thread}.")
    # First-time setup: mirror the forum's CSS and JS so archived pages render offline.
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        os.mkdir("archive/css")
        for f in range(len(css_to_change)):
            r = requests.get(css_to_change[f])
            with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        os.mkdir("archive/scripts")
        for f in range(len(scripts_to_change)):
            r = requests.get(scripts_to_change[f])
            with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir(f"archive/{args.thread}"):
        print(f"Creating directory for {args.thread}...")
        os.mkdir(f"archive/{args.thread}")
    if not os.path.isdir(f"archive/{args.thread}/images"):
        print(f"Creating directory for {args.thread}/images...")
        os.mkdir(f"archive/{args.thread}/images")

    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        sys.exit(1)
    config.read('config.ini')

    if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
        print("username and password must be present in config.ini.")
        sys.exit(1)

    info = {"username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login"
            }

    # Log in so the session cookie grants access to the forums.
    s = requests.Session()
    q = s.post("https://forums.somethingawful.com/account.php", data=info)

    # Resume from the last page fetched for this thread, if one was recorded.
    if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
        lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
    else:
        lastpage = 1

    i = lastpage
    parse_ok = True
    while True:
        time.sleep(0.05)
        payload = {'threadid': args.thread, 'pagenumber': str(i)}
        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
        if "Specified thread was not found in the live forums." in r.text:
            print("That thread does not exist or is not accessible to you.")
            parse_ok = False
            break
        if "The page number you requested" in r.text:
            # Walked past the end of the thread; the previous page was the last one.
            i -= 1
            break
        print(f"Fetching page {i} in thread {args.thread}.")
        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
            soup = BeautifulSoup(r.text, "html.parser")
            # Point stylesheets and scripts at the local mirrors.
            for tag in soup.find_all("link", {"href": True}):
                if tag["href"] in css_to_change:
                    tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
            for tag in soup.find_all("script", {"src": True}):
                if tag["src"] in scripts_to_change:
                    tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
            # Rewrite the pagination links so they point at the archived pages.
            for tag in soup.find_all("a", {"title": True}):
                if tag["title"] == "Next page":
                    tag["href"] = f"page{i+1}.html"
                if tag["title"] == "Previous page":
                    tag["href"] = f"page{i-1}.html"
            if args.images:
                for tag in soup.find_all("img", {"src": True}):
                    src = tag["src"]
                    if src[:4] != "http":
                        src = "https:" + src
                    imgname = src.split("/")[-1]
                    fullpath = f"archive/{args.thread}/images/{imgname}"
                    if os.path.isfile(fullpath):
                        # Already downloaded on an earlier pass.
                        tag["src"] = f"images/{imgname}"
                    else:
                        img = s.get(src, stream=True)
                        if img.status_code == 200:
                            try:
                                theimage = Image.open(BytesIO(img.content))
                                print(f"\tSaving {fullpath}.")
                                theimage.save(fullpath)
                                tag["src"] = f"images/{imgname}"
                            except Exception:
                                print(f"\tImage {src} not available.")
                        else:
                            print(f"\tImage {src} not available.")
            file.write(soup.prettify())
        i += 1

    if parse_ok:
        print("Finished fetching thread.")
        # Remember where we stopped so the next run resumes from this page.
        config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
        with open("config.ini", "w") as file:
            config.write(file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
    parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
    args = parser.parse_args()
    main(args)
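
Assuming the dependencies listed in requirements.txt below are installed and config.ini holds valid forum credentials, a run might look like the following; the thread id here is just an example (the one hard-coded in scraper.py):

    python main.py 3908778 --images

Pages are written to archive/<threadid>/pageN.html, shared CSS and JS go under archive/css and archive/scripts, and with --images the post images land in archive/<threadid>/images.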

requirements.txt  (+16 -0)

@@ -0,0 +1,16 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.9
itsdangerous==1.1.0
Jinja2==2.11.3
MarkupSafe==1.1.1
pillow>=8.3.2
psycopg2==2.8.5
requests==2.24.0
soupsieve==2.0.1
urllib3>=1.26.5
Werkzeug==1.0.1
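
To set up an environment for these scripts, the usual command applies:

    pip install -r requirements.txt

Only requests, beautifulsoup4, and Pillow are imported directly by main.py and scraper.py; the remaining pins appear to come from the surrounding environment.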

scraper.py  (+71 -0)

@@ -0,0 +1,71 @@
import requests, time, re, os, configparser, sys
from bs4 import BeautifulSoup

config = configparser.ConfigParser()
if not os.path.isfile('config.ini'):
    print("config.ini is missing!")
    sys.exit(1)
config.read('config.ini')

if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
    print("username and password must be present in config.ini.")
    sys.exit(1)

info = {"username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login"
        }

# files = [f for f in os.listdir('pages') if os.path.isfile("./pages/{}".format(f))]

if not os.path.isdir('pages'):
    os.mkdir('pages', 0o755)

# Log in so the session cookie grants access to the forums.
s = requests.Session()
q = s.post("https://forums.somethingawful.com/account.php", data=info)
# print(q.text)

# Resume from the last page checked, if one was recorded.
if "lastpage" in config["DEFAULT"] and config["DEFAULT"]["lastpage"] != "":
    lastpage = int(config["DEFAULT"]["lastpage"])
else:
    lastpage = 1

i = lastpage
while True:
    time.sleep(0.1)
    payload = {'threadid': '3908778', 'pagenumber': str(i)}
    r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)  # , cookies=jar)
    # with open("pages/rawpage{}.txt".format(i), "w+") as file:
    #     file.write(r.text)
    if "The page number you requested" in r.text:
        # Walked past the end of the thread; the previous page was the last one.
        i -= 1
        break
    matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE | re.MULTILINE)
    # matcher = re.compile(r'[Gg]aybie[s]? [Nn]om')
    if re.search(matcher, r.text) is not None:
        print("Page {} has a nomination.".format(i))
        soup = BeautifulSoup(r.text, 'html.parser')
        for tag in soup.find_all('tr'):
            keep = False
            latestimg = ""
            for child in tag.descendants:
                # if child.name == "img":
                #     lastimg = child['src']
                res = re.search(matcher, str(child))
                if res is not None:
                    # out = "{}: {}".format(res.group(1), lastimg)
                    # print(out)
                    # with open("nominations.txt", "a") as file:
                    #     file.write(out + "\n")
                    keep = True
            # Drop table rows (posts) that do not contain a nomination.
            if not keep:
                tag.decompose()
        with open("pages/page{}.html".format(i), "w") as file:
            file.write(str(soup))
    else:
        print("Page {} has no nominations.".format(i))
    i += 1

config["DEFAULT"]["lastpage"] = str(i)
with open("config.ini", "w") as file:
    config.write(file)
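
A minimal sketch of what the nomination regex in scraper.py captures, using a made-up line of post text (the sample string is hypothetical):

import re

matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE | re.MULTILINE)

sample = "Gaybies nomination Best effortpost of the year"  # hypothetical post text
m = matcher.search(sample)
if m:
    print(m.group(1))  # prints "Best effortpost of the year"

In scraper.py, any <tr> whose text yields such a match is kept; every other row is decomposed before the trimmed page is written out.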
