User Reconciliation Automation with Python and ChatGPT
-
Every month I am faced with the tedious task of reconciling user accounts from multiple CSV files. The process involved manually scanning each file for email columns, and then consolidating the data into a single file for analysis.
I knew there had to be a better way, and I was determined to find it. After doing some research, I discovered that I could use Python to automate the task.
To help me write the code, I used ChatGPT, a powerful language model that could generate code based on natural language descriptions. With ChatGPT's assistance, I was able to quickly and easily develop a script that could accomplish my desired task.
The script saved me an hour of work on its first run.
This is amazing. If you want it, lemme know. I can share it.
-
Excellent work, I've been meaning to do that for years...
-
Updated the script to include the source header names in the output!
import os import csv import re import pandas as pd # Set the path to the folder containing the CSV files path = os.path.dirname(__file__) # Initialize a list to store the data from each CSV file data = [] # Iterate through each file in the folder for filename in os.listdir(path): # Check if the file is a CSV file if filename.endswith('.csv'): # Open the file and read the data with open(os.path.join(path, filename), 'r') as f: reader = csv.reader(f) file_data = list(reader) # Store the file data in a list # Check if the first row starts with "#TYPE", if so, skip it if file_data[0][0].startswith("#TYPE"): file_data = file_data[1:] # Get the headers headers = file_data[0] # Initialize a list to store the email column indices email_columns = [] # Iterate through the columns to find the email columns for i, header in enumerate(headers): column_data = [row[i] for row in file_data[1:]] # Check if any of the values in the column are valid email addresses for value in column_data: if re.match(r'[^@]+@[^@]+\.[^@]+', value): email_columns.append(i) break # No need to check the rest of the values in this column # If there is more than one email column, prompt the user to choose one if len(email_columns) > 1: print(f'Multiple email columns found in {filename}:') for i, header in enumerate(headers): if i in email_columns: print(f'{i}: {header}') email_column = int(input('Please select the email column by entering the corresponding number: ')) elif len(email_columns) == 1: # If there is only one email column, use that one email_column = email_columns[0] else: # If there are no email columns, skip this file print(f'No email columns found in {filename}, skipping...') continue # Extract the email column data from the file file_data = [[row[email_column], f'{filename}-{headers[email_column]}', filename] for row in file_data[1:]] # Add the data from this file to the list data.extend(file_data) # Create a results subfolder if it doesn't already exist results_path = os.path.join(path, 'results') if not os.path.exists(results_path): os.makedirs(results_path) #Write the data to a CSV file in the results folder with open(os.path.join(results_path, 'emails.csv'), 'w', newline='') as f: writer = csv.writer(f) writer.writerows(data) #Read the CSV file into a pandas DataFrame with open(os.path.join(results_path, 'emails.csv'), 'r') as f: first_line = f.readline().strip() if first_line.startswith("#TYPE"): df = pd.read_csv(f, names=['email', 'source', 'filename'], skiprows=[0]) else: df = pd.read_csv(os.path.join(results_path, 'emails.csv'), names=['email', 'source', 'filename']) #Create the pivot table pivot_table = df.pivot_table(index='email', columns='source', aggfunc='size') #Add a column showing the total number of accounts for each email address pivot_table['total'] = pivot_table.sum(axis=1) #Output the pivot table to a CSV file in the results folder pivot_table.to_csv(os.path.join(results_path, 'pivot_table.csv')) print('Done!')