| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- import sys
- import camelot
- import pandas as pd
def extract_table_headings(tables, column=1):
    """Print every extracted table plus one of its columns, raw and sorted.

    Despite the name, this prints the full DataFrame of each table, not
    just its headings. It is a debugging aid: nothing is returned.

    Args:
        tables: Iterable of objects that expose a pandas DataFrame through
            a ``df`` attribute (``main`` passes the result of
            ``camelot.read_pdf``).
        column: Label of the column to print on its own and to sort by.
            Defaults to 1, the same default ``build_set`` uses.

    Raises:
        KeyError: If a table's DataFrame has no column labelled ``column``.
    """
    for index, table in enumerate(tables):
        frame = table.df

        print(f"Table {index}:")
        print(frame)

        print(f"\nColumn {column}:")
        print(frame[column])

        print(f"\nSorted by Column {column}:")
        # Deliberately not named ``sorted`` so the built-in stays usable.
        ordered = frame.sort_values(column)
        print(ordered[column])

        # Blank lines separate one table's dump from the next.
        print("\n\n")
def build_set(tables, column=1):
    """Collect the distinct values found in one column across all tables.

    Args:
        tables: Iterable of objects that expose a pandas DataFrame through
            a ``df`` attribute.
        column: Label of the column to read from every table's DataFrame.

    Returns:
        A set holding every value that appears in ``column`` of any table;
        an empty set when ``tables`` is empty.
    """
    unique_values = set()
    for table in tables:
        # Each table contributes its whole column; duplicates collapse.
        unique_values.update(table.df[column].tolist())
    return unique_values
def main(
    pdf_path1="D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-新竹基地21010-TR-新竹-比對.pdf",
    pdf_path2="D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-臺東基地21010-TR-臺東-比對.pdf",
):
    """Compare the items listed in two PDFs and print how they differ.

    Tables are read from every page of both files with camelot. The tables
    of the first file are dumped via ``extract_table_headings``; the second
    file's tables are not dumped. The values gathered by ``build_set``
    (its default column) from each file are then compared, and three
    groups are printed: items common to both files, items only in the
    first file, and items only in the second file.

    Args:
        pdf_path1: Path of the first PDF file.
        pdf_path2: Path of the second PDF file.
    """
    # First PDF: read, report the table count, and dump the tables.
    first_tables = camelot.read_pdf(pdf_path1, pages='all')
    print(f"{first_tables.n} Tables read from {pdf_path1}")
    extract_table_headings(first_tables)

    # Second PDF: read and report the table count only.
    second_tables = camelot.read_pdf(pdf_path2, pages='all')
    print(f"{second_tables.n} Tables read from {pdf_path2}")

    first_items = build_set(first_tables)
    second_items = build_set(second_tables)

    shared_items = first_items.intersection(second_items)
    only_in_first = first_items - second_items
    only_in_second = second_items - first_items

    print("\n\n")
    print(f"兩個表格共通的項目: \n{shared_items}\n")
    print(f"第一個表格獨有的項目: \n{only_in_first}\n")
    print(f"第二個表格獨有的項目: \n{only_in_second}\n")
    print("\n\n")
if __name__ == "__main__":
    # Expect exactly two command-line arguments: the two PDF paths.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        # Wrong argument count: show the usage text, then fall back to
        # the default paths built into main().
        print("參數的數目不正確。\n\npython PDFTableComparison.py 檔案1.pdf 檔案2.pdf\n比較兩個 PDF 檔中指定的欄位內容。\n\n")
        main()
    else:
        main(*cli_args)
|