"""Extract class titles from the undergraduate catalog PDF into a text file.

Reads ``2020-2021+Undergraduate+Catalog.pdf`` starting at page 701, collects
the text that follows a ``<four digits> - `` marker on each extracted line,
and writes the unique titles, one per line, to ``classes.txt``.
"""
import re
from typing import Iterable, List, Set

CATALOG_PATH = "2020-2021+Undergraduate+Catalog.pdf"
OUTPUT_PATH = "classes.txt"

# 1-based number of the first page that is scanned; earlier pages are skipped.
# NOTE(review): presumably where the course listings begin -- confirm.
FIRST_PAGE = 701

# Some text, a space, a four-digit number, " - ", then the title (captured).
# Written as a raw string so "\d" is a regex class, not an invalid str escape.
# The leading ".+" is greedy, so when a line contains several such markers
# the text after the LAST one is captured.
TITLE_PATTERN = re.compile(r".+ \d{4} - (.+)")


def read_catalog_lines(
    pdf_path: str = CATALOG_PATH, first_page: int = FIRST_PAGE
) -> List[str]:
    """Return the extracted text lines of *pdf_path* from *first_page* onward.

    Args:
        pdf_path: Path of the PDF file to read.
        first_page: 1-based page number at which extraction starts.

    Returns:
        The text of every page numbered ``first_page`` or higher, split into
        individual lines. Text is not extracted from earlier pages.
    """
    # Imported here rather than at module level so that the pure parsing
    # helper below can be imported without PyPDF2 being installed.
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    page_texts = [
        page.extract_text()
        for page_num, page in enumerate(reader.pages, start=1)
        if page_num >= first_page
    ]
    # Joining with newlines keeps text from adjacent pages on separate lines.
    return "\n".join(page_texts).splitlines()


def extract_class_titles(lines: Iterable[str]) -> Set[str]:
    """Collect the unique titles found in *lines*.

    A line contributes a title when it matches ``TITLE_PATTERN`` from its
    start; the captured text is stripped of surrounding whitespace.

    Args:
        lines: Text lines to scan.

    Returns:
        The set of distinct titles (empty when nothing matches).
    """
    titles = set()
    for line in lines:
        match = TITLE_PATTERN.match(line)
        if match is not None:
            titles.add(match.group(1).strip())
    return titles


def write_titles(titles: Iterable[str], out_path: str = OUTPUT_PATH) -> None:
    """Write *titles* to *out_path*, one per line, in sorted order.

    Sorting makes the output reproducible between runs (set iteration order
    is arbitrary), and the explicit UTF-8 encoding keeps non-ASCII titles
    from depending on the platform's default encoding.
    """
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{title}\n" for title in sorted(titles))


def main() -> None:
    """Read the catalog, extract the class titles and save them to disk."""
    lines = read_catalog_lines(CATALOG_PATH, FIRST_PAGE)
    write_titles(extract_class_titles(lines), OUTPUT_PATH)


if __name__ == "__main__":
    main()