Initialize

Commit ade86eca55 to master by Noëlle, 1 year ago
4 changed files with 234 additions and 0 deletions
  1. config.ini        +3    -0
  2. main.py           +144  -0
  3. requirements.txt  +16   -0
  4. scraper.py        +71   -0

config.ini  (+3 -0)

@@ -0,0 +1,3 @@
[DEFAULT]
username =
password =
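
For reference, a filled-in config.ini might look like the sketch below. The credentials are placeholders, and the lastpage line is hypothetical: main.py writes a lastpage<threadid> key per archived thread (and scraper.py a plain lastpage key) so that later runs can resume where they left off.

[DEFAULT]
username = your_forum_username
password = your_forum_password
lastpage3908778 = 42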

main.py  (+144 -0)

@@ -0,0 +1,144 @@
import requests, time, re, os, configparser, sys, argparse
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Remote stylesheets used by the forum, and the local filenames they are saved under.
css_to_change = ["https://www.somethingawful.com/css/main.css?12",
                 "https://forums.somethingawful.com/css/bbcode.css?1456974408",
                 "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/redmond/jquery-ui.min.css",
                 "https://www.somethingawful.com/css/globalmenu.css",
                 "https://www.somethingawful.com/css/forums.css?1545838155"
                 ]
css_to_change_to = ["main.css",
                    "bbcode.css",
                    "jquery-ui.min.css",
                    "globalmenu.css",
                    "forums.css"
                    ]

# Remote scripts used by the forum, and the local filenames they are saved under.
scripts_to_change = ["https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js",
                     "https://cdnjs.cloudflare.com/ajax/libs/jquery-migrate/1.4.0/jquery-migrate.min.js",
                     "https://ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/jquery-ui.min.js",
                     "https://forums.somethingawful.com/js/vb/forums.combined.js?1476414227",
                     "https://twemoji.maxcdn.com/2/twemoji.min.js",
                     ]
scripts_to_change_to = ["jquery.min.js",
                        "jquery-migrate.min.js",
                        "jquery-ui.min.js",
                        "forums-combined.js",
                        "twemoji.min.js"
                        ]


def main(args):
    print(f"Fetching from thread {args.thread}.")
    # First-time setup: mirror the forum's CSS and JS so archived pages render offline.
    if not os.path.isdir("archive"):
        print("First-time setup...")
        os.mkdir("archive")
    if not os.path.isdir("archive/css"):
        print("Setting up CSS...")
        os.mkdir("archive/css")
        for f in range(len(css_to_change)):
            r = requests.get(css_to_change[f])
            with open(f"archive/css/{css_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir("archive/scripts"):
        print("Setting up scripts...")
        os.mkdir("archive/scripts")
        for f in range(len(scripts_to_change)):
            r = requests.get(scripts_to_change[f])
            with open(f"archive/scripts/{scripts_to_change_to[f]}", "w+") as file:
                file.write(r.text)
    if not os.path.isdir(f"archive/{args.thread}"):
        print(f"Creating directory for {args.thread}...")
        os.mkdir(f"archive/{args.thread}")
    if not os.path.isdir(f"archive/{args.thread}/images"):
        print(f"Creating directory for {args.thread}/images...")
        os.mkdir(f"archive/{args.thread}/images")

    config = configparser.ConfigParser(interpolation=None)
    if not os.path.isfile('config.ini'):
        print("config.ini is missing!")
        sys.exit(1)
    config.read('config.ini')

    if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
        print("username and password must be present in config.ini.")
        sys.exit(1)

    info = {"username": config["DEFAULT"]["username"],
            "password": config["DEFAULT"]["password"],
            "action": "login"
            }

    # Log in so the session cookie grants access to the forums.
    s = requests.Session()
    q = s.post("https://forums.somethingawful.com/account.php", data=info)

    # Resume from the last page fetched for this thread, if one was recorded.
    if f"lastpage{args.thread}" in config["DEFAULT"] and config["DEFAULT"][f"lastpage{args.thread}"] != "":
        lastpage = int(config["DEFAULT"][f"lastpage{args.thread}"])
    else:
        lastpage = 1

    i = lastpage
    parse_ok = True
    while True:
        time.sleep(0.05)
        payload = {'threadid': args.thread, 'pagenumber': str(i)}
        r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)
        if "Specified thread was not found in the live forums." in r.text:
            print("That thread does not exist or is not accessible to you.")
            parse_ok = False
            break
        if "The page number you requested" in r.text:
            # Walked past the end of the thread; the previous page was the last one.
            i -= 1
            break
        print(f"Fetching page {i} in thread {args.thread}.")
        with open(f"archive/{args.thread}/page{i}.html", "w+", encoding="utf-8") as file:
            soup = BeautifulSoup(r.text, "html.parser")
            # Point stylesheets and scripts at the local mirrors.
            for tag in soup.find_all("link", {"href": True}):
                if tag["href"] in css_to_change:
                    tag["href"] = "../css/" + css_to_change_to[css_to_change.index(tag["href"])]
            for tag in soup.find_all("script", {"src": True}):
                if tag["src"] in scripts_to_change:
                    tag["src"] = "../scripts/" + scripts_to_change_to[scripts_to_change.index(tag["src"])]
            # Rewrite the pagination links so they point at the archived pages.
            for tag in soup.find_all("a", {"title": True}):
                if tag["title"] == "Next page":
                    tag["href"] = f"page{i+1}.html"
                if tag["title"] == "Previous page":
                    tag["href"] = f"page{i-1}.html"
            if args.images:
                for tag in soup.find_all("img", {"src": True}):
                    src = tag["src"]
                    if src[:4] != "http":
                        src = "https:" + src
                    imgname = src.split("/")[-1]
                    fullpath = f"archive/{args.thread}/images/{imgname}"
                    if os.path.isfile(fullpath):
                        # Already downloaded on an earlier pass.
                        tag["src"] = f"images/{imgname}"
                    else:
                        img = s.get(src, stream=True)
                        if img.status_code == 200:
                            try:
                                theimage = Image.open(BytesIO(img.content))
                                print(f"\tSaving {fullpath}.")
                                theimage.save(fullpath)
                                tag["src"] = f"images/{imgname}"
                            except Exception:
                                print(f"\tImage {src} not available.")
                        else:
                            print(f"\tImage {src} not available.")
            file.write(soup.prettify())
        i += 1

    if parse_ok:
        print("Finished fetching thread.")
        # Remember where we stopped so the next run resumes from this page.
        config["DEFAULT"][f"lastpage{args.thread}"] = str(i)
        with open("config.ini", "w") as file:
            config.write(file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
    parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
    args = parser.parse_args()
    main(args)
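
Assuming the dependencies listed in requirements.txt below are installed and config.ini holds valid forum credentials, a run might look like the following; the thread id here is just an example (the one hard-coded in scraper.py):

    python main.py 3908778 --images

Pages are written to archive/<threadid>/pageN.html, shared CSS and JS go under archive/css and archive/scripts, and with --images the post images land in archive/<threadid>/images.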

requirements.txt  (+16 -0)

@@ -0,0 +1,16 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.9
itsdangerous==1.1.0
Jinja2==2.11.3
MarkupSafe==1.1.1
pillow>=8.3.2
psycopg2==2.8.5
requests==2.24.0
soupsieve==2.0.1
urllib3>=1.26.5
Werkzeug==1.0.1
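
To set up an environment for these scripts, the usual command applies:

    pip install -r requirements.txt

Only requests, beautifulsoup4, and Pillow are imported directly by main.py and scraper.py; the remaining pins appear to come from the surrounding environment.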

scraper.py  (+71 -0)

@@ -0,0 +1,71 @@
import requests, time, re, os, configparser, sys
from bs4 import BeautifulSoup

config = configparser.ConfigParser()
if not os.path.isfile('config.ini'):
    print("config.ini is missing!")
    sys.exit(1)
config.read('config.ini')

if "username" not in config["DEFAULT"] or "password" not in config["DEFAULT"] or config["DEFAULT"]["username"] == "" or config["DEFAULT"]["password"] == "":
    print("username and password must be present in config.ini.")
    sys.exit(1)

info = {"username": config["DEFAULT"]["username"],
        "password": config["DEFAULT"]["password"],
        "action": "login"
        }

# files = [f for f in os.listdir('pages') if os.path.isfile("./pages/{}".format(f))]

if not os.path.isdir('pages'):
    os.mkdir('pages', 0o755)

# Log in so the session cookie grants access to the forums.
s = requests.Session()
q = s.post("https://forums.somethingawful.com/account.php", data=info)
# print(q.text)

# Resume from the last page checked, if one was recorded.
if "lastpage" in config["DEFAULT"] and config["DEFAULT"]["lastpage"] != "":
    lastpage = int(config["DEFAULT"]["lastpage"])
else:
    lastpage = 1

i = lastpage
while True:
    time.sleep(0.1)
    payload = {'threadid': '3908778', 'pagenumber': str(i)}
    r = s.get("https://forums.somethingawful.com/showthread.php", params=payload)  # , cookies=jar)
    # with open("pages/rawpage{}.txt".format(i), "w+") as file:
    #     file.write(r.text)
    if "The page number you requested" in r.text:
        # Walked past the end of the thread; the previous page was the last one.
        i -= 1
        break
    matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE | re.MULTILINE)
    # matcher = re.compile(r'[Gg]aybie[s]? [Nn]om')
    if re.search(matcher, r.text) is not None:
        print("Page {} has a nomination.".format(i))
        soup = BeautifulSoup(r.text, 'html.parser')
        for tag in soup.find_all('tr'):
            keep = False
            latestimg = ""
            for child in tag.descendants:
                # if child.name == "img":
                #     lastimg = child['src']
                res = re.search(matcher, str(child))
                if res is not None:
                    # out = "{}: {}".format(res.group(1), lastimg)
                    # print(out)
                    # with open("nominations.txt", "a") as file:
                    #     file.write(out + "\n")
                    keep = True
            # Drop table rows (posts) that do not contain a nomination.
            if not keep:
                tag.decompose()
        with open("pages/page{}.html".format(i), "w") as file:
            file.write(str(soup))
    else:
        print("Page {} has no nominations.".format(i))
    i += 1

config["DEFAULT"]["lastpage"] = str(i)
with open("config.ini", "w") as file:
    config.write(file)
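
A minimal sketch of what the nomination regex in scraper.py captures, using a made-up line of post text (the sample string is hypothetical):

import re

matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE | re.MULTILINE)

sample = "Gaybies nomination Best effortpost of the year"  # hypothetical post text
m = matcher.search(sample)
if m:
    print(m.group(1))  # prints "Best effortpost of the year"

In scraper.py, any <tr> whose text yields such a match is kept; every other row is decomposed before the trimmed page is written out.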
