
Update main & requirements

main
Noëlle, 2 years ago
commit 8e926eb990
2 files changed, 60 additions and 24 deletions
1. main.py  (+55, -24)
2. requirements.txt  (+5, -0)

main.py  (+55, -24)

@@ ... @@
 import requests, time, re, os, configparser, sys, argparse, shutil
 from bs4 import BeautifulSoup
 
 
-def main(args):
+def main(args: dict) -> None:
     print(f"Fetching PDFs.")
     if not os.path.isdir("pdfs"):
-        print("First-time setup...")
+        print("Setting up PDFs directory.")
         os.mkdir("pdfs")
+    if args.save_pages and not os.path.isdir("html"):
+        print("Setting up HTML directory.")
+        os.mkdir("html")
     config = configparser.ConfigParser(interpolation=None)
     if not os.path.isfile('config.ini'):
@@ ... @@
         "action": "login"
     }
 
-    s = requests.Session()
-    q = s.post("{}login".format(INFOVAYA_ROOT), data=info)
-    q.raise_for_status()
+    infovaya_session = requests.Session()
+    infovaya_creds_request = infovaya_session.post(f"{INFOVAYA_ROOT}login", data=info)
+    infovaya_creds_request.raise_for_status()
 
-    sessions_request = s.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
+    sessions_request = infovaya_session.get("{}event?id=88&actionMenu=sessions".format(INFOVAYA_ROOT))
     sessions_request.raise_for_status()
     sessions_page_soup = BeautifulSoup(sessions_request.text, "html.parser")
+    if args.save_pages:
+        with open("html/main.html", "w+", encoding="utf-8") as file:
+            file.write(sessions_page_soup.prettify())
     print("Starting to search for PDFs. This may take a while...")
     for session_link in sessions_page_soup.find_all("a", href=re.compile("session\?id")):
-        time.sleep(0.05)
-        page_request = s.get("{}{}".format(INFOVAYA_ROOT, session_link["href"]))
-        page_request.raise_for_status()
-        page_soup = BeautifulSoup(page_request.text, "html.parser")
-        page_title = page.soup.h1.string
-        for pdf_tag in page_soup.find_all("a", href=re.compile("pdfviewer")):
-            pdf_name = pdf_tag["href"].split("/")[-1]
-            file_name = "{}-{}".format(page_title.replace(" ", "_"), pdf_name)
-            pdf_request = s.get("{}{}".format(INFOVAYA_ROOT, pdf_tag["href"]), stream=True)
-            pdf_request.raise_for_status()
-            print("Fetching PDF for '{}'...".format(page_title))
-            with open(file_name, "wb") as file:
-                pdf_request.raw.decode_content = True
-                shutil.copyfileobj(pdf_request.raw, file)
-            print("Saved as pdfs/{}".format(file_name))
+        session_href = session_link["href"][1:]
+        page_name = f"session_{session_href.split('=')[-1]}.html"
+        if not os.path.isfile(f"html/{page_name}"):
+            time.sleep(1)
+            print(f"Fetching {session_href} ... ", end="")
+            page_request = infovaya_session.get(f"{INFOVAYA_ROOT}{session_href}")
+            page_request.raise_for_status()
+            print("Success! Finding presentations... ", end="")
+            page_soup = BeautifulSoup(page_request.text, "html.parser")
+            if args.save_pages:
+                with open(f"html/{page_name}", "w+", encoding="utf-8") as file:
+                    file.write(page_soup.prettify())
+                print(f"Wrote to file html/{page_name}.")
+            else:
+                print()
+            # import pdb; pdb.set_trace()
+            for presentation_link in page_soup.find_all("a", href=re.compile("presentation\?id")):
+                presentation_href = presentation_link["href"][1:]
+                pres_page_name = f"pres_{presentation_href.split('=')[-1]}.html"
+                if not os.path.isfile(f"html/{pres_page_name}"):
+                    print(f" Fetching {presentation_href} ...", end="")
+                    pres_request = infovaya_session.get(f"{INFOVAYA_ROOT}{presentation_href}")
+                    pres_request.raise_for_status()
+                    print("Success! ", end="")
+                    pres_page_soup = BeautifulSoup(pres_request.text, "html.parser")
+                    if args.save_pages:
+                        with open(f"html/{pres_page_name}", "w+", encoding="utf-8") as file:
+                            file.write(pres_page_soup.prettify())
+                    pres_page_title = pres_page_soup.h1.string
+                    print(f"Found '{pres_page_title}'. Finding PDF...")
+                    for pdf_tag in pres_page_soup.find_all("a", href=re.compile("pdfviewer")):
+                        pdf_name = pdf_tag["href"].split("/")[-1]
+                        file_name = f"{pres_page_title.replace(' ', '_')}-{pdf_name}"
+                        pdf_request = infovaya_session.get(f"{INFOVAYA_ROOT}{pdf_tag['href'][1:]}")
+                        pdf_request.raise_for_status()
+                        print(f"Fetching PDF for '{pres_page_title}'...")
+                        with open(f"pdfs/{file_name}", "wb") as file:
+                            file.write(pdf_request.content)
+                        print(f"Saved as pdfs/{file_name}")
+                    time.sleep(2)
+    print("Finished fetching thread.")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("thread", action="store", help="The threadid from the thread's URL")
-    parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
+    parser.add_argument("-s", "--save-pages", action="store_true", help="Save HTML of ")
+    # parser.add_argument("-i", "--images", action="store_true", help="Set this flag to download images as well as HTML.\nNOTE: This may be VERY bandwidth and disk intensive!")
     args = parser.parse_args()
+    print(args)
     main(args)
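The rewrite above swaps the terse s/q handles for a named infovaya_session, moves string building to f-strings, and restructures the crawl so each event session page is walked for presentation pages before PDFs are fetched. It also adds an optional html/ page cache behind --save-pages and skips any session or presentation whose HTML is already on disk. Below is a minimal sketch of that session-plus-cache pattern; the INFOVAYA_ROOT value and the credential field names are assumptions, since the diff shows neither (only the "action": "login" fragment of the login payload is visible).

import os
from typing import Optional

import requests

# Hypothetical placeholder: the real INFOVAYA_ROOT is defined in a part of
# main.py this diff does not show.
INFOVAYA_ROOT = "https://example.invalid/"
# Only the "action": "login" key is visible in the diff; the credential
# field names here are assumptions.
info = {"username": "someone", "password": "secret", "action": "login"}

# One Session object carries the login cookies through every later request.
session = requests.Session()
login = session.post(f"{INFOVAYA_ROOT}login", data=info)
login.raise_for_status()  # fail fast if the login was rejected

def fetch_page(href: str, cache_name: str) -> Optional[str]:
    """Fetch a page unless its HTML was already saved by a previous run,
    mirroring the os.path.isfile() guards in the new loop."""
    if os.path.isfile(f"html/{cache_name}"):
        return None  # already cached; the new code skips refetching
    response = session.get(f"{INFOVAYA_ROOT}{href}")
    response.raise_for_status()
    return response.text

With the new flag, a run looks something like python main.py <threadid> --save-pages. Note that the positional thread argument and the final "Finished fetching thread." message survive from the script's forum-scraper origins, while the Infovaya event id (88) appears to stay hardcoded in the sessions URL.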

requirements.txt  (+5, -0)

 beautifulsoup4==4.11.1
 bs4==0.0.1
+certifi==2022.6.15
+charset-normalizer==2.0.12
 click==8.1.3
+idna==3.3
 mypy-extensions==0.4.3
 pathspec==0.9.0
 platformdirs==2.5.2
+requests==2.28.0
 soupsieve==2.3.2.post1
 tomli==2.0.1
+urllib3==1.26.9
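The five new pins are requests itself plus its transitive dependencies (certifi, charset-normalizer, idna, urllib3), matching the requests-based fetching in main.py. Assuming a standard virtual environment, everything installs with pip install -r requirements.txt.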
