# PDFTableComparison.py (2.3 KB)
  1. import sys
  2. import camelot
  3. import pandas as pd
  4. def extract_table_headings(tables):
  5. # Iterate over each extracted table
  6. for i, table in enumerate(tables):
  7. # Get the dataframe from the table
  8. df = table.df
  9. # Print the headings (column names) of the dataframe
  10. # print(f"Table {i+1} Headings:")
  11. # print(df.columns.tolist())
  12. print(f"Table {i}:")
  13. print (df)
  14. print("\nColumn 1:")
  15. print(df[1])
  16. print("\nSorted by Column 1:")
  17. sorted = df.sort_values(1)
  18. print(sorted[1])
  19. print("\n\n")
  20. def build_set(tables, column=1):
  21. l = []
  22. for i, table in enumerate(tables):
  23. l.extend(table.df[column].tolist())
  24. # print(f"build_set list: {l}")
  25. return set(l)
  26. def main(pdf_path1 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-新竹基地21010-TR-新竹-比對.pdf",
  27. pdf_path2 = "D:\\Eugene Shen\\運輸及土木 Samples\\(預算書)空軍E001工程-臺東基地21010-TR-臺東-比對.pdf"):
  28. # Extract and print table headings for the first PDF
  29. #print(f"Extracting tables from {pdf_path1}")
  30. # Read the PDF file
  31. tables1 = camelot.read_pdf(pdf_path1, pages='all')
  32. print(f"{tables1.n} Tables read from {pdf_path1}")
  33. extract_table_headings(tables1)
  34. # Extract and print table headings for the second PDF
  35. #print(f"\nExtracting tables from {pdf_path2}")
  36. # Read the PDF file
  37. tables2 = camelot.read_pdf(pdf_path2, pages='all')
  38. print(f"{tables2.n} Tables read from {pdf_path2}")
  39. # extract_table_headings(tables2)
  40. #set1 = set(tables1[0].df[1].tolist())
  41. #set2 = set(tables2[0].df[1].tolist())
  42. set1 = build_set(tables1)
  43. set2 = build_set(tables2)
  44. print("\n\n")
  45. print(f"兩個表格共通的項目: \n{set1.intersection(set2)}\n")
  46. print(f"第一個表格獨有的項目: \n{set1-set2}\n")
  47. print(f"第二個表格獨有的項目: \n{set2-set1}\n")
  48. print("\n\n")
  49. #print(f"Difference between the table contents: {sym_diff}")
  50. if __name__ == "__main__":
  51. args = sys.argv[1:]
  52. if len(args) == 2:
  53. main(args[0], args[1])
  54. else:
  55. print("參數的數目不正確。\n\npython PDFTableComparison.py 檔案1.pdf 檔案2.pdf\n比較兩個 PDF 檔中指定的欄位內容。\n\n")
  56. main()