"""Compare one column of the tables extracted from two PDF files.

Usage:
    python PDFTableComparison.py file1.pdf file2.pdf

Every table found in each PDF is read with camelot, the values of one column
(column label 1 by default) are collected into a set per PDF, and the common
and PDF-specific values are printed.
"""

import sys

import pandas as pd

# Sample documents used when the script is not given exactly two paths.
DEFAULT_PDF_PATH1 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-新竹基地21010-TR-新竹-比對.pdf"
DEFAULT_PDF_PATH2 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-臺東基地21010-TR-臺東-比對.pdf"


def extract_table_headings(tables, column=1):
    """Print a debugging dump of every table in *tables*.

    Despite its name, this prints table contents rather than headings: for
    each table it prints the whole dataframe, then the selected column, then
    that column again after sorting the dataframe by it.

    Args:
        tables: Iterable of objects exposing a pandas ``DataFrame`` as ``.df``
            (e.g. the table list returned by ``camelot.read_pdf``).
        column: Label of the column to print and sort by. Defaults to 1,
            matching the default of ``build_set``.

    Raises:
        KeyError: If a table's dataframe has no column labelled *column*.
    """
    for index, table in enumerate(tables):
        df = table.df

        print(f"Table {index}:")
        print(df)

        print(f"\nColumn {column}:")
        print(df[column])

        print(f"\nSorted by Column {column}:")
        # Named sorted_df (not "sorted") so the builtin is not shadowed.
        sorted_df = df.sort_values(column)
        print(sorted_df[column])
        print("\n\n")


def build_set(tables, column=1):
    """Return the set of distinct values found in *column* across all tables.

    Args:
        tables: Iterable of objects exposing a pandas ``DataFrame`` as ``.df``.
        column: Label of the column whose values are collected. Defaults to 1.

    Returns:
        A set with every value of ``table.df[column]`` for every table; an
        empty set when *tables* is empty.

    Raises:
        KeyError: If a table's dataframe has no column labelled *column*.
    """
    values = set()
    for table in tables:
        values.update(table.df[column].tolist())
    return values


def main(pdf_path1=DEFAULT_PDF_PATH1, pdf_path2=DEFAULT_PDF_PATH2):
    """Read the tables of two PDFs and print how their column-1 values differ.

    The tables of the first PDF are also dumped via ``extract_table_headings``
    before the comparison is printed.

    Args:
        pdf_path1: Path of the first PDF file.
        pdf_path2: Path of the second PDF file.
    """
    # Imported here, not at module level: main() is the only user, and this
    # keeps the helper functions importable without the PDF stack installed.
    import camelot

    tables1 = camelot.read_pdf(pdf_path1, pages='all')
    print(f"{tables1.n} Tables read from {pdf_path1}")
    extract_table_headings(tables1)

    tables2 = camelot.read_pdf(pdf_path2, pages='all')
    print(f"{tables2.n} Tables read from {pdf_path2}")

    set1 = build_set(tables1)
    set2 = build_set(tables2)

    print("\n\n")
    print(f"兩個表格共通的項目: \n{set1.intersection(set2)}\n")
    print(f"第一個表格獨有的項目: \n{set1-set2}\n")
    print(f"第二個表格獨有的項目: \n{set2-set1}\n")
    print("\n\n")


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) == 2:
        main(args[0], args[1])
    else:
        # Wrong argument count: show the usage text, then fall back to the
        # built-in sample paths.
        print("參數的數目不正確。\n\npython PDFTableComparison.py 檔案1.pdf 檔案2.pdf\n比較兩個 PDF 檔中指定的欄位內容。\n\n")
        main()