Updated the script to include the source header names in the output!
import os
import csv
import re
import pandas as pd
# Set the path to the folder containing the CSV files
path = os.path.dirname(__file__)
# Initialize a list to store the data from each CSV file
data = []
# Iterate through each file in the folder
for filename in os.listdir(path):
# Check if the file is a CSV file
if filename.endswith('.csv'):
# Open the file and read the data
with open(os.path.join(path, filename), 'r') as f:
reader = csv.reader(f)
file_data = list(reader) # Store the file data in a list
# Check if the first row starts with "#TYPE", if so, skip it
if file_data[0][0].startswith("#TYPE"):
file_data = file_data[1:]
# Get the headers
headers = file_data[0]
# Initialize a list to store the email column indices
email_columns = []
# Iterate through the columns to find the email columns
for i, header in enumerate(headers):
column_data = [row[i] for row in file_data[1:]]
# Check if any of the values in the column are valid email addresses
for value in column_data:
if re.match(r'[^@]+@[^@]+\.[^@]+', value):
email_columns.append(i)
break # No need to check the rest of the values in this column
# If there is more than one email column, prompt the user to choose one
if len(email_columns) > 1:
print(f'Multiple email columns found in {filename}:')
for i, header in enumerate(headers):
if i in email_columns:
print(f'{i}: {header}')
email_column = int(input('Please select the email column by entering the corresponding number: '))
elif len(email_columns) == 1:
# If there is only one email column, use that one
email_column = email_columns[0]
else:
# If there are no email columns, skip this file
print(f'No email columns found in {filename}, skipping...')
continue
# Extract the email column data from the file
file_data = [[row[email_column], f'{filename}-{headers[email_column]}', filename] for row in file_data[1:]]
# Add the data from this file to the list
data.extend(file_data)
# Create a results subfolder if it doesn't already exist
results_path = os.path.join(path, 'results')
if not os.path.exists(results_path):
os.makedirs(results_path)
#Write the data to a CSV file in the results folder
with open(os.path.join(results_path, 'emails.csv'), 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(data)
#Read the CSV file into a pandas DataFrame
with open(os.path.join(results_path, 'emails.csv'), 'r') as f:
first_line = f.readline().strip()
if first_line.startswith("#TYPE"):
df = pd.read_csv(f, names=['email', 'source', 'filename'], skiprows=[0])
else:
df = pd.read_csv(os.path.join(results_path, 'emails.csv'), names=['email', 'source', 'filename'])
#Create the pivot table
pivot_table = df.pivot_table(index='email', columns='source', aggfunc='size')
#Add a column showing the total number of accounts for each email address
pivot_table['total'] = pivot_table.sum(axis=1)
#Output the pivot table to a CSV file in the results folder
pivot_table.to_csv(os.path.join(results_path, 'pivot_table.csv'))
print('Done!')