
Update main & requirements

Branch: main
Noëlle committed 1 year ago
commit 8e926eb990
2 changed files with 60 additions and 24 deletions:
  1. main.py (+55, −24)
  2. requirements.txt (+5, −0)

main.py (+55, −24)

@@ -1,11 +1,14 @@
 import requests, time, re, os, configparser, sys, argparse, shutil
 from bs4 import BeautifulSoup
 
-def main(args):
+def main(args: dict) -> None:
     print(f"Fetching PDFs.")
     if not os.path.isdir("pdfs"):
         print("First-time setup...")
         print("Setting up PDFs directory.")
         os.mkdir("pdfs")
+    if args.save_pages and not os.path.isdir("html"):
+        print("Setting up HTML directory.")
+        os.mkdir("html")
     config = configparser.ConfigParser(interpolation=None)
     if not os.path.isfile('config.ini'):
@@ -24,36 +27,64 @@ def main(args):
         "action": "login"
     }
 
-    s = requests.Session()
-    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
-    q.raise_for_status()
+    infovaya_session = requests.Session()
+    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
+    infovaya_creds_request.raise_for_status()
 
-    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
+    sessions_request = infovaya_session.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
     sessions_request.raise_for_status()
     sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
+    if args.save_pages:
+        with open("html/main.html", "w+", encoding="utf-8") as file:
+            file.write(sessions_page_soup.prettify())
     print("Starting to search for PDFs. This may take a while...")
     for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
-        time.sleep(0.05)
-        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
-        page_request.raise_for_status()
-        page_soup = BeautifulSoup(page_request.text, "html.parser")
-        page_title = page.soup.h1.string
-        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
-            pdf_name = pdf_tag["href"].split("/")[-1]
-            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
-            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
-            pdf_request.raise_for_status()
-            print("Fetching PDF for '{}'...".format(page_title))
-            with open(file_name, "wb") as file:
-                pdf_request.raw.decode_content = True
-                shutil.copyfileobj(pdf_request.raw, file)
-                print("Saved as pdfs/{}".format(file_name))
+        session_href = session_link["href"][1:]
+        page_name = f"session_{session_href.split('=')[-1]}.html"
+        if not os.path.isfile(f"html/{page_name}"):
+            time.sleep(1)
+            print(f"Fetching {session_href} ... ", end="")
+            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
+            page_request.raise_for_status()
+            print("Success! Finding presentations... ", end="")
+            page_soup = BeautifulSoup(page_request.text, "html.parser")
+            if args.save_pages:
+                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
+                    file.write(page_soup.prettify())
+                print(f"Wrote to file html/{page_name}.")
+            else:
+                print()
+            # import pdb; pdb.set_trace()
+            for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
+                presentation_href = presentation_link["href"][1:]
+                pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
+                if not os.path.isfile(f"html/{pres_page_name}"):
+                    print(f" Fetching {presentation_href} ...", end="")
+                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
+                    pres_request.raise_for_status()
+                    print("Success! ", end="")
+                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
+                    if args.save_pages:
+                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
+                            file.write(pres_page_soup.prettify())
+                    pres_page_title = pres_page_soup.h1.string
+                    print(f"Found '{pres_page_title}'. Finding PDF...")
+                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
+                        pdf_name = pdf_tag["href"].split("/")[-1]
+                        file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
+                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
+                        pdf_request.raise_for_status()
+                        print(f"Fetching PDF for '{pres_page_title}'...")
+                        with open(f"pdfs/{file_name}", "wb") as file:
+                            file.write(pdf_request.content)
+                        print(f"Saved as pdfs/{file_name}")
+                    time.sleep(2)
     print("Finished fetching thread.")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
-    parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
+    parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of ")
+    # parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
     args = parser.parse_args()
+    print(args)
     main(args)
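
Note: the login POST at the top of this hunk sends an `info` payload assembled from config.ini, which main.py reads with configparser just before these lines. The hunk does not show the config file's section or key names, so the sketch below is only an illustrative guess at its layout, not the file's actual schema:

    ; config.ini -- hypothetical layout; section and key names are assumed
    [login]
    username = your-infovaya-username
    password = your-infovaya-password

With credentials in place, a typical invocation would be `python main.py --save-pages`, using the -s flag added in this commit to keep local copies of the fetched HTML alongside the PDFs.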

requirements.txt (+5, −0)

@@ -1,8 +1,13 @@
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15
charset-normalizer==2.0.12
click==8.1.3
idna==3.3
mypy-extensions==0.4.3
pathspec==0.9.0
platformdirs==2.5.2
requests==2.28.0
soupsieve==2.3.2.post1
tomli==2.0.1
urllib3==1.26.9
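
These pins install with the standard pip workflow; the virtual environment below is an assumption for illustration, not something the repo mandates:

    python -m venv .venv && . .venv/bin/activate   # optional isolation (assumed)
    pip install -r requirements.txt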
