eshen
/
BudgetX


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
							import sys
import camelot
import pandas as pd

def extract_table_headings(tables, column=1):
    """Print a debugging dump of every extracted table.

    Despite its name, this function does not print headings: for each table
    it prints the whole dataframe, then the selected column on its own, then
    that column again with the rows sorted by its values.

    Args:
        tables: Iterable of objects exposing a ``df`` attribute that holds a
            pandas DataFrame (``main`` passes the result of
            ``camelot.read_pdf``).
        column: Label of the column to print separately and to sort by.
            Defaults to 1, the label this function previously hard-coded and
            the same default ``build_set`` uses.

    Raises:
        KeyError: If a table's dataframe has no column labelled ``column``.
    """
    for index, table in enumerate(tables):
        df = table.df

        print(f"Table {index}:")
        print(df)
        print(f"\nColumn {column}:")
        print(df[column])
        print(f"\nSorted by Column {column}:")
        # Named ``sorted_df`` so the built-in ``sorted`` is not shadowed.
        sorted_df = df.sort_values(column)
        print(sorted_df[column])
        print("\n\n")

def build_set(tables, column=1):
    """Return the distinct values of one column gathered across all tables.

    Args:
        tables: Iterable of objects exposing a ``df`` attribute that holds a
            pandas DataFrame.
        column: Label of the dataframe column to collect values from.

    Returns:
        A ``set`` containing every value found in ``column`` of any table;
        an empty set when ``tables`` is empty.
    """
    return {
        value
        for table in tables
        for value in table.df[column].tolist()
    }

def main(pdf_path1 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-新竹基地21010-TR-新竹-比對.pdf",
         pdf_path2 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-臺東基地21010-TR-臺東-比對.pdf"):
    """Compare one column of the tables extracted from two PDF files.

    Both PDFs are parsed with ``camelot.read_pdf`` over all pages. The tables
    of the first PDF are additionally dumped via ``extract_table_headings``.
    The column values of each PDF are then collected with ``build_set``
    (using its default column) and three sets are printed: the items common
    to both, the items found only in the first, and the items found only in
    the second.

    Args:
        pdf_path1: Path of the first PDF file.
        pdf_path2: Path of the second PDF file.
    """
    # First PDF: parse, report the table count, and dump every table.
    tables1 = camelot.read_pdf(pdf_path1, pages='all')
    print(f"{tables1.n} Tables read from {pdf_path1}")
    extract_table_headings(tables1)

    # Second PDF: parse and report the table count only (no per-table dump).
    tables2 = camelot.read_pdf(pdf_path2, pages='all')
    print(f"{tables2.n} Tables read from {pdf_path2}")

    set1, set2 = build_set(tables1), build_set(tables2)

    # Report sections, in order: common items, only-in-first, only-in-second,
    # framed by blank-line padding before and after.
    report = (
        "\n\n",
        f"兩個表格共通的項目： \n{set1.intersection(set2)}\n",
        f"第一個表格獨有的項目: \n{set1-set2}\n",
        f"第二個表格獨有的項目: \n{set2-set1}\n",
        "\n\n",
    )
    for section in report:
        print(section)

if __name__ == "__main__":
    # Script entry point: exactly two PDF paths are expected on the command line.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        # Wrong argument count: print the usage text, then still run the
        # comparison with main()'s built-in default paths.
        print("參數的數目不正確。\n\npython PDFTableComparison.py 檔案1.pdf 檔案2.pdf\n比較兩個 PDF 檔中指定的欄位內容。\n\n")
        main()
    else:
        main(*cli_args)