Skip to content

Commit

Permalink
updated dopo init files
Browse files Browse the repository at this point in the history
  • Loading branch information
cafriedb committed Aug 23, 2024
1 parent ee57602 commit cac6ce1
Show file tree
Hide file tree
Showing 8 changed files with 1,660 additions and 7 deletions.
142 changes: 142 additions & 0 deletions dev/compare_databases_to_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import pandas as pd

def _lca_scores_compare(database_dict, method_dict):
# Dictionary to store DataFrames for each sector
sector_dataframes = {}

# Labels for the DataFrame columns
labels = [
"activity",
"activity key",
"reference product",
"location",
"method name",
"method unit",
"total",
]

# Loop through each sector in the database_dict
for sector, sector_data in database_dict.items():
# Initialize a dictionary to hold DataFrames for each method in the current sector
method_dataframes = {}

# Loop through each method in method_dict
for meth_key, meth_info in method_dict.items():
data = [] # Initialize a new list to hold data for the current method

# Extract the 'method name' tuple from the current method info
method_name = meth_info['method name']
method_unit = meth_info['unit']

# Now loop through each activity in the sector
for act in sector_data['activities']:
# Ensure the activity is an instance of the expected class
if not isinstance(act, bd.backends.peewee.proxies.Activity):
raise ValueError("`activities` must be an iterable of `Activity` instances")

# Perform LCA calculations
lca = bw.LCA({act: 1}, method_name)
lca.lci()
lca.lcia()

# Collect data for the current activity and method
data.append([
act["name"],
act.key,
act.get("reference product"),
act.get("location", "")[:25],
method_name,
method_unit,
lca.score,
])

# Convert the data list to a DataFrame and store it in the sector's dictionary
method_dataframes[meth_key] = pd.DataFrame(data, columns=labels)

# Store the method_dataframes dictionary in the sector_dataframes dictionary
sector_dataframes[sector] = method_dataframes

# Now `sector_dataframes` is a dictionary where each key is a sector, and the value is another dictionary with method names and their corresponding DataFrames
return sector_dataframes


import pandas as pd

def _relative_changes_df(database_dict_eco, database_dict_premise):

ecoinvent_scores = _lca_scores_compare(database_dict_eco)
premise_scores = _lca_scores_compare(database_dict_premise)

relative_dict = {}

# Iterate over sectors
for sector_key in ecoinvent_scores:
# Initialize the sector key in the output dictionary
if sector_key not in relative_dict:
relative_dict[sector_key] = {}

# Iterate over methods within the sector
for method_key in ecoinvent_scores[sector_key]:
# Check if the method_key exists in both dictionaries to avoid KeyError
if method_key in premise_scores.get(sector_key, {}):
# Get the corresponding DataFrames
df_ei = ecoinvent_scores[sector_key][method_key]
df_premise = premise_scores[sector_key][method_key]

#print(df_ei['activity key'])
#print(df_premise)

# Split the 'activity key' to extract the second part
df_ei['activity_code'] = df_ei['activity key'].apply(lambda x: x[1]) # Access the second element of the tuple
df_premise['activity_code'] = df_premise['activity key'].apply(lambda x: x[1])

# Merge the two dataframes based on the activity code and method name
merged_df = pd.merge(df_ei, df_premise, on=['activity_code', 'method name'], suffixes=('_ei', '_premise'))

# Calculate the relative change
merged_df['relative_change'] = ((merged_df['total_premise'] - merged_df['total_ei']) / merged_df['total_ei']) * 100

# Store the result in the dictionary
relative_dict[sector_key][method_key] = merged_df

return relative_dict

from dopo_excel import add_sector_marker

def relative_changes_db(database_dict_eco, database_dict_premise, excel_file):

relative_dict = (_relative_changes_df(database_dict_eco, database_dict_premise))

# Prepare to save each LCA score table to a different worksheet in the same Excel file

column_positions = {} #stores the indexes of columns for plotting
with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
for sector in relative_dict.keys():
relative_changes = relative_dict[sector]

for method, table in relative_changes.items():
# Create a DataFrame for the current LCA score table
df = pd.DataFrame(table)

# Add sector marker
df = add_sector_marker(df, sector) #!! ADJUST POSITION

# Sort the DataFrame by 'relative_change' from largest negative to largest positive
df = df.sort_values(by='relative_change', ascending=False)

# Add a 'rank' column based on the 'relative_change', ranking from most negative to least negative
df['rank'] = df['relative_change'].rank(ascending=False, method='dense').astype(int)

# Get the index values of columns
columns_of_interest = ["rank", "relative_change", "method", "method unit", ]
positions = {col: df.columns.get_loc(col) for col in columns_of_interest if col in df.columns}
column_positions[method] = positions

# Generate worksheet name
worksheet_name = f"{sector}_{method}"
if len(worksheet_name) > 31:
worksheet_name = worksheet_name[:31]

# Save the DataFrame to the Excel file in a new worksheet
df.to_excel(writer, sheet_name=worksheet_name, index=False)
return column_positions
193 changes: 193 additions & 0 deletions dev/lca_to_excl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
def sector_lca_scores(main_dict, method_dict):
'''
Generates the LCA score tables for activity list of each sector.
The tables contain total scores and cpc input contributions.
This is done by each method defined in the method dictionary.
:param main_dict: dictionary which is returned by process_yaml_files function
:param method_dict: dictionary which is created with MethodFinder class
It returns the main dictionary updated as scores dictionary which also holds the former information for each sector.
The LCA scores are stored by method name in the respective sector dictionary within the main dictionary.
'''

# Initialize scores_dict as a copy of main_dict
scores_dict = main_dict.copy()

# Loop through each sector in main_dict
for sector in scores_dict.keys():
# Extract activities for the current sector
sector_activities = scores_dict[sector]['activities']

# Calculate LCA scores using the specified method
lca_scores = compare_activities_multiple_methods(
activities_list=sector_activities,
methods=method_dict,
identifier=sector,
mode='absolute'
)

# Apply the small_inputs_to_other_column function with the cutoff value
lca_scores = small_inputs_to_other_column(lca_scores, cutoff=0.02)

# Save the LCA scores to the scores_dict
scores_dict[sector]['lca_scores'] = lca_scores

return scores_dict

# -----------------------------------------
# CREATING EXCEL SHEETS WITH LCA TABLES
# -----------------------------------------

def sector_lca_scores_to_excel_and_column_positions(scores_dict, excel_file_name):
"""
What it does:
- Creates a dataframe for each method and sector from the lca scores dictionary
- Before storing each df in a worksheet in an excel file it:
- shortens the column labels of the input (removing cpc code)
- adds a sector name marker for keeping track in excel (when plotting can use it for labeling)
- adds statistics for plotting
- creates a dictionary which holds the indexes to the columns we need to call for plotting, this makes it dynamic. Otherwise need to hardcode index column number for openpxyl.
What it returns:
- Returns the index positions dictionary where the key is "sector_method"
- Creates excel file as defined by user
"""

# Prepare to save each LCA score table to a different worksheet in the same Excel file
excel_file = excel_file_name
column_positions = {} #stores the indexes of columns for plotting
with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
for sector in scores_dict.keys():
lca_scores = scores_dict[sector]['lca_scores']
for method, table in lca_scores.items():
# Create a DataFrame for the current LCA score table
df = pd.DataFrame(table)

# Add sector marker
df = add_sector_marker(df, sector) #!! ADJUST POSITION

# Add statistics to the DataFrame
df = add_statistics(df)

# Get the index values of columns
columns_of_interest = ["total", "rank", "mean", "2std_abv", "2std_blw", "q1", "q3", "method", "method unit"]
positions = {col: df.columns.get_loc(col) for col in columns_of_interest if col in df.columns}
column_positions[method] = positions

# Find the first input column and add it to the positions dictionary
first_input_col_index = find_first_input_column(df)
if first_input_col_index is not None:
positions["first_input"] = first_input_col_index

# Store the positions for this method
column_positions[method] = positions

# remove cpc from input labels
df = clean_column_labels(df)

# Generate a worksheet name
worksheet_name = f"{method}" #f"{sector}_{method}"
if len(worksheet_name) > 31:
worksheet_name = worksheet_name[:31]

# Save the DataFrame to the Excel file in a new worksheet
df.to_excel(writer, sheet_name=worksheet_name, index=False)
return column_positions


def add_statistics(df, column_name='total'):

'''
It is called in the function sector_lca_scores_to_excel_and_column_positions
It adds statistical indicators to a dataframe based on total column which are used for plotting.
returns updated dataframe
'''

#Need a rank row to plot the total LCA scores in descending order (satter opepyxl function takes in non categorial values)
df['rank'] = df[column_name].rank(method="first", ascending="False")

# Calculate mean, standard deviation, and IQR
df['mean'] = df[column_name].mean()
df['2std_abv'] = df['mean'] + df[column_name].std() * 2
df['2std_blw'] = df['mean'] - df[column_name].std() * 2
df['q1'] = df[column_name].quantile(0.25)
df['q3'] = df[column_name].quantile(0.75)

# Reorder the columns to place the new columns after 'total'
cols = df.columns.tolist()
total_index = cols.index(column_name) + 1
new_cols = ['rank', 'mean', '2std_abv', '2std_blw', 'q1', 'q3']
cols = cols[:total_index] + new_cols + cols[total_index:-len(new_cols)]

return df[cols]


def find_first_input_column(df):
'''
It is called in the function sector_lca_scores_to_excel_and_column_positions. Needs to be called before clean_column_labels function.
Detects the first column in the dataframe which contains input contribution data and saves its index.
This is relevant for calling the right column for defining the to be plotted data dynamically as not all dataframes have the same column order (some contain "direct emissions" for instance).
'''

def clean_label(label):
return label if label is not None else 'Unnamed'

# Apply the cleaning function to all column names
df.columns = [clean_label(col) for col in df.columns]

# Regular expression pattern to match "Number: Name"
pattern = r'^\d+:\s*'

for idx, column in enumerate(df.columns):
if (column is not None and re.match(pattern, column)) or column == 'Unnamed' or column == 'direct emissions':
return idx

return None

def clean_column_labels(df):

'''
It is called in the function sector_lca_scores_to_excel_and_column_positions. Needs to be run after find_first_input_column.
It removes unnecessary numbers in the column header.
Returns df with formated column labels.
'''
# Function to remove numbers and colon from column names
def clean_label(label):
if label is None:
return 'Unnamed' # or return 'Unnamed' if you prefer a placeholder
return re.sub(r'^\d+:\s*', '', str(label))

# Apply the cleaning function to all column names
df.columns = [clean_label(col) for col in df.columns]

return df

def add_sector_marker(df, sector):
'''
It is called in the function sector_lca_scores_to_excel_and_column_positions.
It adds information about the sector for titel and labeling in plotting.
Returns df with added column.
'''

# Add sector marker column
df['sector']=str(sector) # potentially remove!
# Reorder the columns to move 'sector' after 'product'
columns = list(df.columns)

if 'product' in df.columns:
product_index = columns.index('product')
# Insert 'sector' after 'product'
columns.insert(product_index + 1, columns.pop(columns.index('sector')))
else:
# If 'product' does not exist, 'sector' remains in the last column
columns.append(columns.pop(columns.index('sector')))

# Reassign the DataFrame with the new column order
df = df[columns]
return df
Loading

0 comments on commit cac6ce1

Please sign in to comment.