2. ML modeling and Shapley Feature Extraction¶

  • Load structured data and perform exploratory data analysis (EDA).
  • Modeling and Feature Extraction.
In [1]:
import os
import sys
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import warnings

warnings.filterwarnings('ignore')
os.environ["KMP_WARNINGS"] = "0"

from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap 

seed = 1234
np.random.seed(seed)
random.seed(seed)
In [2]:
df_pheno = pd.read_csv("../data/brca_pheno.csv")
In [3]:
print("Loading normalized mRNA expressions on TCGA-BRCA GDC samples...")
df_gdc = pd.read_csv("../data/TMP_20230209/BRCA_v12_20210228.tsv", sep="\t")
df_gdc = df_gdc[(~df_gdc.iloc[:, 2:].isna().all(axis=1)) & (~(df_gdc.iloc[:, 2:] == 0).all(axis=1))]
expression_cols = [col for col in df_gdc.columns if 'GEXP' in col]
df_gdc = df_gdc[['BRCA', 'Labels'] + expression_cols]
df_gdc.columns = [col.split(":")[3] if ':' in col else col for col in df_gdc.columns]

if df_gdc.columns.duplicated().any():
    print("Duplicate columns detected in GDC data. Removing duplicates...")
    df_gdc = df_gdc.loc[:, ~df_gdc.columns.duplicated()]

print("Joining grip query result on BRCA subtype's with normalized mRNA expressions...")
df_pheno.rename(columns={"secondary_identifier": "BRCA"}, inplace=True)
df_gdc = df_gdc.merge(df_pheno[['BRCA', 'subtype']], on='BRCA', how='left')
df_gdc.drop('Labels', axis=1, inplace=True)
df_gdc.rename(columns={"subtype": "Labels"}, inplace=True)
df_gdc.index = df_gdc.BRCA
demo_order = ['Labels'] + list(df_gdc.columns[3:10])
Loading normalized mRNA expressions on TCGA-BRCA GDC samples...
Duplicate columns detected in GDC data. Removing duplicates...
Joining grip query result on BRCA subtype's with normalized mRNA expressions...
In [4]:
subtype_colors = {
    "LumA": "#44AA99",  
    "LumB": "#FFC107",  
    "Basal": "#AA4499", 
    "Her2": "#449AE4"   
}

print("EDA - Evaluating data distribution...")
count_per_label = df_gdc['Labels'].value_counts()

plt.figure(figsize=(6, 4))
sns.barplot(x=count_per_label.index, y=count_per_label.values, palette=[subtype_colors[label] for label in count_per_label.index])
plt.title("Molecular subtype counts", fontsize=12)
plt.xlabel("Subtype", fontsize=10), plt.ylabel("Count", fontsize=10)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# UMAP on TCGA-BRCA normalized scRNA-Seq mRNA expression 
gene_expression_cols = df_gdc.drop(columns=["Labels", "BRCA"]).columns  
df_gdc_scaled = df_gdc[['Labels'] + list(gene_expression_cols)].copy()

scaler = StandardScaler()
df_gdc_scaled[gene_expression_cols] = scaler.fit_transform(df_gdc_scaled[gene_expression_cols])

umap_reducer = umap.UMAP(n_components=2, random_state=seed)
umap_embedding = umap_reducer.fit_transform(df_gdc_scaled[gene_expression_cols])
df_gdc_scaled['UMAP1'] = umap_embedding[:, 0]
df_gdc_scaled['UMAP2'] = umap_embedding[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_gdc_scaled, x="UMAP1", y="UMAP2", hue="Labels", palette=subtype_colors, s=60, alpha=0.8)
plt.title("UMAP: mRNA expression - TCGA-BRCA cohort")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.legend(title="Subtype")
plt.grid(True)
plt.tight_layout()
plt.show()
EDA - Evaluating data distribution...
No description has been provided for this image
No description has been provided for this image
  • Luminal A and Luminal B are both classified as hormone receptor-positive (HR+), meaning they express estrogen and progesterone receptors.
  • Luminal A is usually HER2-negative while Luminal B may be HER2-positive, which can impact treatment options.
  • Basal tumors lack estrogen receptors, making them "triple negative" (ER-, PR-, HER2-) while Luminal cancers are typically hormone receptor positive (ER+) and have a better outlook with hormone therapy options.
In [5]:
import shap
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import logging

tf.get_logger().setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)
seed = 1234
tf.random.set_seed(seed)
np.random.seed(seed)
# ----------------------------
# data prep
# ----------------------------
# subtype mappings 
class_maps = {0: "LumA", 1: "LumB", 2: "Basal", 3: "Her2"}

# remove unwanted columns
cols_to_drop = [col for col in ["Labels", "BRCA"] if col in df_gdc_scaled.columns]
pca_cols = [col for col in df_gdc_scaled.columns if str(col).startswith("PCA")] 
cols_to_drop.extend(pca_cols)

X_df = df_gdc_scaled.drop(columns=cols_to_drop).apply(pd.to_numeric, errors='coerce').fillna(0)
feature_names = list(X_df.columns)
X = X_df.values  # features (genes - standard ensemble gene names)

y = df_gdc['Labels'].astype('category').cat.codes
y_onehot = to_categorical(y, num_classes=len(class_maps))
X, y_onehot = shuffle(X, y_onehot, random_state=seed)

# ----------------------------
# f1 score function for tensorflow models metrics 
# ----------------------------
@tf.function(reduce_retracing=True)
def tf_f1_score(y_true, y_pred):
    y_pred = tf.round(y_pred) 
    tp = tf.reduce_sum(tf.cast(y_true * y_pred, 'float32'), axis=0)
    precision = tp / (tf.reduce_sum(tf.cast(y_pred, 'float32'), axis=0) + tf.keras.backend.epsilon())
    recall = tp / (tf.reduce_sum(tf.cast(y_true, 'float32'), axis=0) + tf.keras.backend.epsilon())
    f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
    return tf.reduce_mean(f1)
    
# ----------------------------
# simple tensorflow.keras NN model
# ----------------------------
def create_model(input_shape, num_classes):
    """Simple feed-forward neural network with two hidden layers: 
        - deep layers: fully connected layers with 64 neurons each, using the ReLU activation function.
        - dropout: regularization technique to prevent overfitting by randomly dropping neurons during training.
        - Final layer uses the softmax activation function for multi-class classification (to output probabilities for each class). 
    """
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_shape,)), Dropout(0.2), Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')  
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf_f1_score])
    return model

# ----------------------------
# cv to examine model performance 
# ----------------------------
k_folds = 10 # max n fold desired 
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

top_N = 50 # number of top features you want per fold
sample_data = X[:len(X)] # fixed sample of data 
fold_accuracies = []
fold_f1 = []
fold_number = 1 # start from fold 1

for train_idx, test_idx in kf.split(X): # split data into training and testing sets 
    print(f"\n ===== Fold {fold_number}/{k_folds} ===== ")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y_onehot[train_idx], y_onehot[test_idx]
    
    model = create_model(X.shape[1], len(class_maps)) # create NN model and train
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    acc = accuracy_score(y_test_classes, y_pred_classes)
    test_loss, test_accuracy, test_f1 = model.evaluate(X_test, y_test, verbose=0)
    print(f"Fold {fold_number} Accuracy: {acc:.4f}, f1-score: {test_f1:.4f}")

    fold_accuracies.append(acc)
    fold_f1.append(test_f1)
    fold_number += 1 # move to next fold

mean_acc = np.mean(fold_accuracies)
std_acc = np.std(fold_accuracies)
mean_f1 = np.mean(fold_f1)
std_f1 = np.std(fold_f1)
print(f"\nOveral CV NN model's average performance:")
print(f"Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
print(f"F1 score: {mean_f1:.4f} ± {std_f1:.4f}")
 ===== Fold 1/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 1 Accuracy: 0.9200, f1-score: 0.8049

 ===== Fold 2/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 2 Accuracy: 0.8300, f1-score: 0.6146

 ===== Fold 3/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 3 Accuracy: 0.9100, f1-score: 0.8177

 ===== Fold 4/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 4 Accuracy: 0.9000, f1-score: 0.8049

 ===== Fold 5/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 5 Accuracy: 0.8800, f1-score: 0.8188

 ===== Fold 6/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 6 Accuracy: 0.9091, f1-score: 0.6903

 ===== Fold 7/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 7 Accuracy: 0.8485, f1-score: 0.7844

 ===== Fold 8/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 8 Accuracy: 0.8283, f1-score: 0.7049

 ===== Fold 9/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 9 Accuracy: 0.8687, f1-score: 0.6276

 ===== Fold 10/10 ===== 
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
Fold 10 Accuracy: 0.8889, f1-score: 0.7227

Overal CV NN model's average performance:
Accuracy: 0.8783 ± 0.0318
F1 score: 0.7391 ± 0.0742
In [ ]:
# ----------------------------
# Build model on all data
# ----------------------------
final_model = create_model(X.shape[1], len(class_maps))
final_model.fit(X, y_onehot, epochs=10, batch_size=32, verbose=1)

# ----------------------------
# SHAP features
# https://shap.readthedocs.io
# https://www.aidancooper.co.uk/a-non-technical-guide-to-interpreting-shap-analyses
# ----------------------------
_explainer = shap.DeepExplainer(final_model, sample_data) 
_shap_values = _explainer.shap_values(sample_data)
_shap_values = np.transpose(_shap_values, (2, 0, 1)) 

_features_by_subtype = {}
for i, subtype in class_maps.items():
    shap_obj = shap.Explanation(values=_shap_values[i], feature_names=feature_names)
    shap_importance = shap_obj.abs.sum(0)
    top_indices = np.argsort(shap_importance.values)[-top_N:]
    top_features = [feature_names[idx] for idx in top_indices]
    _features_by_subtype[subtype] = top_features
    print(f"\n{subtype} top {top_N} features: {top_features}\n")
Epoch 1/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7202 - loss: 1.1888 - tf_f1_score: 0.5501   
Epoch 2/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.9063 - loss: 0.4544 - tf_f1_score: 0.8507 
Epoch 3/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.9422 - loss: 0.2248 - tf_f1_score: 0.9040 
Epoch 4/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.9591 - loss: 0.1618 - tf_f1_score: 0.9173 
Epoch 5/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.9659 - loss: 0.1616 - tf_f1_score: 0.9250 
Epoch 6/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9820 - loss: 0.1806 - tf_f1_score: 0.9353 
Epoch 7/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.9824 - loss: 0.0753 - tf_f1_score: 0.9452     
Epoch 8/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9914 - loss: 0.0264 - tf_f1_score: 0.9559     
Epoch 9/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9921 - loss: 0.0517 - tf_f1_score: 0.9522     
Epoch 10/10
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.9839 - loss: 0.0684 - tf_f1_score: 0.9309     
In [ ]:
output_filename = "../data/selected_features_by_subtype2.json"
with open(output_filename, "w") as f:
    json.dump(_features_by_subtype, f, indent=4)