1. Unit Test¶

1.1 Confirm that the image-label merging logic works¶

In [72]:
import pandas as pd

def test_image_label_merge():
    
    df_labels = pd.DataFrame({
        "image_id": ["img1.png", "img2.png"],
        "patient_id": [101, 102]})
    
    df_meta = pd.DataFrame({
        "patient_id": [101, 102],
        "pathology": ["BENIGN", "MALIGNANT"]})
    
    df = df_labels.merge(df_meta, on="patient_id", how="left")

    assert df.loc[0, 'pathology'] == "BENIGN", "Label mismatch for img1"
    assert df.loc[1, 'pathology'] == "MALIGNANT", "Label mismatch for img2"
    print("Test passed: image-label matching works.")

test_image_label_merge()
Test passed: image-label matching works.
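
The same assertion-based test can also be collected by a test runner. A minimal sketch, assuming pytest is available in the environment and the function above is saved to a file (the name test_merge.py is only an example):

# test_merge.py contains the test_image_label_merge() function from the cell above
!pytest test_merge.py -q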

2. Install Python Packages¶

2.1 Required Libraries¶

  • pandas -> DataFrame manipulation
  • matplotlib -> plotting and visualizing results
  • opencv-python -> resizing DICOM pixel arrays to the training image size
  • tensorflow -> building and training the CNN
  • pydicom -> loading DICOM medical image files into NumPy arrays
  • scikit-learn -> train-test split, label encoding, confusion matrix, classification report
In [73]:
!pip install pandas matplotlib opencv-python tensorflow pydicom scikit-learn
Requirement already satisfied: pandas in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (2.0.3)
Requirement already satisfied: matplotlib in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (3.7.2)
Requirement already satisfied: opencv-python in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (4.11.0.86)
Requirement already satisfied: tensorflow in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (2.18.0)
Requirement already satisfied: pydicom in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (3.0.1)
Requirement already satisfied: scikit-learn in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (1.3.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from pandas) (2023.3)
Requirement already satisfied: numpy>=1.21.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from pandas) (1.26.4)
Requirement already satisfied: contourpy>=1.0.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (1.0.5)
Requirement already satisfied: cycler>=0.10 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: packaging>=20.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (23.1)
Requirement already satisfied: pillow>=6.2.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (9.4.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: absl-py>=1.0.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (2.1.0)
Requirement already satisfied: astunparse>=1.6.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (1.6.3)
Requirement already satisfied: flatbuffers>=24.3.25 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (25.1.24)
Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (0.6.0)
Requirement already satisfied: google-pasta>=0.1.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (0.2.0)
Requirement already satisfied: libclang>=13.0.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (18.1.1)
Requirement already satisfied: opt-einsum>=2.3.2 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (3.4.0)
Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (5.29.3)
Requirement already satisfied: requests<3,>=2.21.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (2.31.0)
Requirement already satisfied: setuptools in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (68.0.0)
Requirement already satisfied: six>=1.12.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (1.16.0)
Requirement already satisfied: termcolor>=1.1.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (2.5.0)
Requirement already satisfied: typing-extensions>=3.6.6 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (4.12.2)
Requirement already satisfied: wrapt>=1.11.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (1.14.1)
Requirement already satisfied: grpcio<2.0,>=1.24.3 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (1.70.0)
Requirement already satisfied: tensorboard<2.19,>=2.18 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (2.18.0)
Requirement already satisfied: keras>=3.5.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (3.6.0)
Requirement already satisfied: h5py>=3.11.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (3.12.1)
Requirement already satisfied: ml-dtypes<0.5.0,>=0.4.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (0.4.1)
Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorflow) (0.37.1)
Requirement already satisfied: scipy>=1.5.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from scikit-learn) (1.11.1)
Requirement already satisfied: joblib>=1.1.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: wheel<1.0,>=0.23.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from astunparse>=1.6.0->tensorflow) (0.38.4)
Requirement already satisfied: rich in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from keras>=3.5.0->tensorflow) (13.9.2)
Requirement already satisfied: namex in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from keras>=3.5.0->tensorflow) (0.0.8)
Requirement already satisfied: optree in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from keras>=3.5.0->tensorflow) (0.13.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2023.7.22)
Requirement already satisfied: markdown>=2.6.8 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorboard<2.19,>=2.18->tensorflow) (3.4.1)
Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorboard<2.19,>=2.18->tensorflow) (0.7.2)
Requirement already satisfied: werkzeug>=1.0.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from tensorboard<2.19,>=2.18->tensorflow) (2.2.3)
Requirement already satisfied: MarkupSafe>=2.1.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow) (2.1.1)
Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from rich->keras>=3.5.0->tensorflow) (2.2.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from rich->keras>=3.5.0->tensorflow) (2.15.1)
Requirement already satisfied: mdurl~=0.1 in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow) (0.1.0)
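
A quick import check can confirm that all of the required libraries listed in 2.1 are usable in the current kernel; a minimal sketch (the versions printed will differ per machine):

import pandas, matplotlib, cv2, pydicom, sklearn
import tensorflow as tf

# print the installed version of each required library
for name, mod in [("pandas", pandas), ("matplotlib", matplotlib), ("opencv-python", cv2),
                  ("tensorflow", tf), ("pydicom", pydicom), ("scikit-learn", sklearn)]:
    print(f"{name}: {mod.__version__}")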

3. Iteration 1: First Model¶

3.1 DICOM file scanner¶

Build a DataFrame containing the paths and metadata of all images in the CBIS-DDSM folder.

In [74]:
import os
import pandas as pd

cbis_root = '../data/manifest-1747663624077/CBIS-DDSM/'

image_paths = []

for root, dirs, files in os.walk(cbis_root):
    for file in files:
        if file.endswith('.dcm'):
            full_path = os.path.join(root, file)
            parts = full_path.split(os.sep)

            folder_name = parts[-4] if len(parts) >= 4 else ''  # the case-level folder sits four levels above the .dcm file
            try:
                patient_info = folder_name.split('_P_')
                if len(patient_info) == 2:
                    patient_id_raw, view = patient_info[1].split('_', 1)
                    patient_id = 'P_' + patient_id_raw

                    image_paths.append({
                        'image_path': full_path,
                        'patient_id': patient_id,
                        'view': view })
            except Exception as e:
                print(f"Skipping {full_path}: {e}")
                continue

df_images = pd.DataFrame(image_paths)
print(f"Total images found: {len(df_images)}")
df_images  
Total images found: 12
Out[74]:
image_path patient_id view
0 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00041 LEFT_CC
1 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00163 LEFT_MLO
2 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00141 LEFT_MLO
3 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00140 LEFT_MLO
4 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00141 LEFT_CC
5 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_MLO
6 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_CC
7 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00140 LEFT_CC
8 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00041 LEFT_MLO
9 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00163 LEFT_CC
10 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00100 RIGHT_MLO
11 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00100 RIGHT_CC

3.2 Merge pathology labels with image paths¶

In [75]:
labels_test_df = pd.read_csv('../data/calc_case_description_test_set.csv')

print("Sample IDs:", labels_test_df['patient_id'].unique()[:5])
labels_subset = labels_test_df[['patient_id', 'pathology']].drop_duplicates()
df_merged = df_images.merge(labels_subset, on='patient_id', how='left')

print(f"Total labeled images: {df_merged['pathology'].notnull().sum()} / {len(df_merged)}")
df_merged.head(12)
Sample IDs: ['P_00038' 'P_00041' 'P_00077' 'P_00100' 'P_00127']
Total labeled images: 14 / 14
Out[75]:
image_path patient_id view pathology
0 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00041 LEFT_CC BENIGN_WITHOUT_CALLBACK
1 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00163 LEFT_MLO BENIGN
2 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00141 LEFT_MLO BENIGN
3 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00140 LEFT_MLO BENIGN_WITHOUT_CALLBACK
4 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00141 LEFT_CC BENIGN
5 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_MLO BENIGN
6 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_MLO BENIGN_WITHOUT_CALLBACK
7 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_CC BENIGN
8 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00038 LEFT_CC BENIGN_WITHOUT_CALLBACK
9 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00140 LEFT_CC BENIGN_WITHOUT_CALLBACK
10 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00041 LEFT_MLO BENIGN_WITHOUT_CALLBACK
11 ../data/manifest-1747663624077/CBIS-DDSM/Calc-... P_00163 LEFT_CC BENIGN

3.3 Save as CSV¶

Save the merged dataset for training/processing.

In [76]:
df_merged.to_csv('image_labels.csv', index=False)
print("CSV saved")
CSV saved

3.4 DICOM Preprocessing¶

Check that the DICOM image preprocessing works correctly:

  • Load the CSV containing image paths and pathology labels
  • Read a .dcm image file
  • Normalize pixel values to 0–1
  • Resize the image to 224×224
  • Visualize the first image in the dataset with its true pathology label
In [77]:
import pandas as pd
import numpy as np
import pydicom
import cv2 
import matplotlib.pyplot as plt

df = pd.read_csv('image_labels.csv')

def load_and_preprocess_dicom(path, size=(224, 224)):
    dicom = pydicom.dcmread(path)
    img = dicom.pixel_array.astype(np.float32)
    
    # min-max normalize pixel values to the [0, 1] range
    img -= img.min()
    img /= img.max()
    
    img_resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    
    return img_resized

example_path = df['image_path'].iloc[0]
example_img = load_and_preprocess_dicom(example_path)

plt.imshow(example_img, cmap='gray')
plt.title(df['pathology'].iloc[0])
plt.axis('off')
plt.show()

3.5 Load All Images Into Arrays¶

In [78]:
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

images = []
labels = []

le = LabelEncoder()
df['pathology_encoded'] = le.fit_transform(df['pathology'])

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        img = load_and_preprocess_dicom(row['image_path'])
        images.append(img)
        labels.append(row['pathology_encoded'])
    except Exception as e:
        print(f"Failed to process {row['image_path']}: {e}")

X = np.array(images).reshape(-1, 224, 224, 1) 
y = np.array(labels)

print("Data loaded. Shape:", X.shape, y.shape)
print("Classes:", le.classes_)
100%|███████████████████████████████████████████| 14/14 [00:01<00:00,  9.62it/s]
Data loaded. Shape: (14, 224, 224, 1) (14,)
Classes: ['BENIGN' 'BENIGN_WITHOUT_CALLBACK']
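
With only 14 samples, class balance matters; a minimal sketch that counts how many images fall into each encoded class, using the arrays built above:

import numpy as np

# count images per class; np.bincount follows the encoding order of le.classes_
for cls, count in zip(le.classes_, np.bincount(y)):
    print(f"{cls}: {count}")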

3.6 Build & Train the CNN¶

In [79]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

y_cat = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax') 
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=4, validation_data=(X_test, y_test))
Epoch 1/10
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 94ms/step - accuracy: 0.5994 - loss: 0.6512 - val_accuracy: 0.3333 - val_loss: 1.4282
Epoch 2/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.6761 - loss: 1.0614 - val_accuracy: 0.6667 - val_loss: 0.6576
Epoch 3/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 70ms/step - accuracy: 0.7841 - loss: 0.4869 - val_accuracy: 0.6667 - val_loss: 0.6139
Epoch 4/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.6449 - loss: 0.6840 - val_accuracy: 0.6667 - val_loss: 0.6537
Epoch 5/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.7074 - loss: 0.5216 - val_accuracy: 0.3333 - val_loss: 0.6782
Epoch 6/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.6307 - loss: 0.4953 - val_accuracy: 0.3333 - val_loss: 0.6649
Epoch 7/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 73ms/step - accuracy: 0.5994 - loss: 0.5461 - val_accuracy: 0.6667 - val_loss: 0.6184
Epoch 8/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 76ms/step - accuracy: 0.8153 - loss: 0.4038 - val_accuracy: 0.6667 - val_loss: 0.6598
Epoch 9/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 67ms/step - accuracy: 0.7216 - loss: 0.4577 - val_accuracy: 0.6667 - val_loss: 0.6401
Epoch 10/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 67ms/step - accuracy: 0.8778 - loss: 0.3657 - val_accuracy: 0.6667 - val_loss: 0.6339

3.7 Plot Accuracy/Loss Curves¶

In [80]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Acc')
plt.plot(history.history['val_accuracy'], label='Val Acc')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

3.8 Evaluate Model¶

Use the test set to calculate:

  • Accuracy
  • Precision
  • Recall
  • F1-score
  • Confusion matrix
In [82]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

y_pred_probs = model.predict(X_test)

y_pred_classes = y_pred_probs.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

print(classification_report(y_true_classes, y_pred_classes, target_names=le.classes_))

cm = confusion_matrix(y_true_classes, y_pred_classes)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 25ms/step
                         precision    recall  f1-score   support

                 BENIGN       0.50      1.00      0.67         1
BENIGN_WITHOUT_CALLBACK       1.00      0.50      0.67         2

               accuracy                           0.67         3
              macro avg       0.75      0.75      0.67         3
           weighted avg       0.83      0.67      0.67         3
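
ConfusionMatrixDisplay is imported above but the matrix cm is computed without being rendered; a minimal sketch that plots it with the objects already in scope:

# visualize the confusion matrix computed above
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues')
plt.show()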

3.9 Visual Sample Predictions¶

  • Show a few test images with true vs predicted labels
In [83]:
for i in range(len(X_test)):
    plt.imshow(X_test[i].reshape(224, 224), cmap='gray')
    plt.title(f"True: {le.classes_[y_true_classes[i]]} | Predicted: {le.classes_[y_pred_classes[i]]}")
    plt.axis('off')
    plt.show()

4. Iteration 2: More Data and Improved Model Training¶

4.1 Match DICOM Files with Labels¶

In [84]:
import os
import pandas as pd

df_train = pd.read_csv('../data/calc_case_description_train_set.csv')
df_test = pd.read_csv('../data/calc_case_description_test_set.csv')
df_labels = pd.concat([df_train, df_test], ignore_index=True)

df_labels['patient_id'] = df_labels['patient_id'].str.strip()
df_labels['left or right breast'] = df_labels['left or right breast'].str.upper()
df_labels['image view'] = df_labels['image view'].str.upper()

cbis_root = '../data/manifest-1747932126245/CBIS-DDSM/'

image_data = []

for case_folder in os.listdir(cbis_root):
    full_case_path = os.path.join(cbis_root, case_folder)
    if not os.path.isdir(full_case_path):
        continue
    try:
        parts = case_folder.split('_')  # case folder name encodes patient, side, and view
        patient_id = f'P_{parts[2]}'    # e.g. P_00063
        side = parts[3]                 # LEFT or RIGHT
        view = parts[4]                 # CC or MLO
        for root, dirs, files in os.walk(full_case_path):
            for file in files:
                if file.endswith('.dcm'):
                    image_path = os.path.join(root, file)
                    image_data.append({
                        'image_path': image_path,
                        'patient_id': patient_id,
                        'side': side,
                        'view': view
                    })
    except Exception as e:
        print(f"Skipping {case_folder}: {e}")
        continue

df_images = pd.DataFrame(image_data)

df_merged = df_images.merge(
    df_labels,
    left_on=['patient_id', 'side', 'view'],
    right_on=['patient_id', 'left or right breast', 'image view'],
    how='inner'
)

df_final = df_merged[['image_path', 'patient_id', 'side', 'view', 'pathology']].copy()
df_final = df_final[df_final['pathology'].isin(['BENIGN', 'MALIGNANT'])].copy()
print(f"Total matched DICOMs: {len(df_final)}")
df_final.head()
Total matched DICOMs: 107
Out[84]:
image_path patient_id side view pathology
0 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_00063 RIGHT CC MALIGNANT
1 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_00071 RIGHT MLO BENIGN
2 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_00062 LEFT CC BENIGN
3 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_00649 LEFT CC BENIGN
4 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_01154 LEFT MLO MALIGNANT

4.2 Save as CSV¶

In [85]:
df_final.to_csv('../data/image_labels.csv', index=False)
print("Saved CSV")
Saved CSV

4.3 Convert DICOMs to NumPy Arrays for Training¶

In [86]:
import pydicom
import cv2
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

IMG_SIZE = 224 
X = []
y = []

for _, row in tqdm(df_final.iterrows(), total=len(df_final)):
    try:
        dcm = pydicom.dcmread(row['image_path'])
        img = dcm.pixel_array.astype(np.float32)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0  # scale pixel values (note: DICOM intensities can exceed the 8-bit range)
        img = np.expand_dims(img, axis=-1) 

        X.append(img)
        y.append(row['pathology'])

    except Exception as e:
        print(f"Error with {row['image_path']}: {e}")

X = np.array(X)
y = np.array(y)

print(f"X shape: {X.shape}, Y shape: {y.shape}")
100%|█████████████████████████████████████████| 107/107 [00:03<00:00, 32.59it/s]
X shape: (107, 224, 224, 1), Y shape: (107,)

4.4 Encode Labels and Split Into Train/Test Sets¶

In [87]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
Train samples: 85, Test samples: 22

4.5 CNN Model Training¶

In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    Dense(1, activation='sigmoid') 
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=8,
    callbacks=[early_stop]
)
Epoch 1/20
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
11/11 ━━━━━━━━━━━━━━━━━━━━ 2s 111ms/step - accuracy: 0.5419 - loss: 99.7223 - val_accuracy: 0.5000 - val_loss: 9.1453
Epoch 2/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 109ms/step - accuracy: 0.7035 - loss: 3.4935 - val_accuracy: 0.4545 - val_loss: 0.7975
Epoch 3/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 127ms/step - accuracy: 0.8602 - loss: 0.5257 - val_accuracy: 0.5000 - val_loss: 0.8758
Epoch 4/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 107ms/step - accuracy: 0.9224 - loss: 0.2221 - val_accuracy: 0.5000 - val_loss: 0.8598
Epoch 5/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 107ms/step - accuracy: 0.9326 - loss: 0.2097 - val_accuracy: 0.5000 - val_loss: 1.0553

4.6 Evaluate the Model Performance¶

In [89]:
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 111ms/step
In [90]:
from sklearn.metrics import confusion_matrix, classification_report

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['BENIGN', 'MALIGNANT']))
Confusion Matrix:
[[3 7]
 [5 7]]

Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.38      0.30      0.33        10
   MALIGNANT       0.50      0.58      0.54        12

    accuracy                           0.45        22
   macro avg       0.44      0.44      0.44        22
weighted avg       0.44      0.45      0.45        22

5. Iteration 3: Final Model¶

5.1 Match DICOM Files with Labels¶

In [91]:
import os
import pandas as pd

df_train = pd.read_csv('../data/calc_case_description_train_set.csv')
df_test = pd.read_csv('../data/calc_case_description_test_set.csv')
df_labels = pd.concat([df_train, df_test], ignore_index=True)

df_labels['patient_id'] = df_labels['patient_id'].str.strip()
df_labels['left or right breast'] = df_labels['left or right breast'].str.upper()
df_labels['image view'] = df_labels['image view'].str.upper()

cbis_root_new = '../data/manifest-1747944277650/CBIS-DDSM/'

new_image_data = []

for case_folder in os.listdir(cbis_root_new):
    full_case_path = os.path.join(cbis_root_new, case_folder)

    if not os.path.isdir(full_case_path):
        continue

    try:
        parts = case_folder.split('_')
        patient_id = f'P_{parts[2]}'
        side = parts[3]
        view = parts[4]

        for root, dirs, files in os.walk(full_case_path):
            for file in files:
                if file.endswith('.dcm'):
                    image_path = os.path.join(root, file)
                    new_image_data.append({
                        'image_path': image_path,
                        'patient_id': patient_id,
                        'side': side,
                        'view': view
                    })
    except Exception as e:
        print(f"Skipping {case_folder}: {e}")
        continue

df_new_images = pd.DataFrame(new_image_data)

df_new_merged = df_new_images.merge(
    df_labels,
    left_on=['patient_id', 'side', 'view'],
    right_on=['patient_id', 'left or right breast', 'image view'],
    how='inner'
)

df_new_final = df_new_merged[['image_path', 'patient_id', 'side', 'view', 'pathology']].copy()
df_new_final = df_new_final[df_new_final['pathology'].isin(['BENIGN', 'MALIGNANT'])]

print(f"Matched new DICOMs: {len(df_new_final)}")
df_new_final.head()
Matched new DICOMs: 344
Out[91]:
image_path patient_id side view pathology
0 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00467 RIGHT MLO MALIGNANT
1 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00445 LEFT MLO MALIGNANT
2 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00243 LEFT CC MALIGNANT
3 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00377 LEFT MLO BENIGN
4 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00297 RIGHT CC BENIGN

5.2 Merge old and new datasets¶

In [92]:
df_combined = pd.concat([df_final, df_new_final], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Total dataset size after merge: {len(df_combined)}")
df_combined.head()
Total dataset size after merge: 451
Out[92]:
image_path patient_id side view pathology
0 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_01004 LEFT CC BENIGN
1 ../data/manifest-1747932126245/CBIS-DDSM/Calc-... P_00663 RIGHT MLO BENIGN
2 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_01022 LEFT MLO BENIGN
3 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00005 RIGHT MLO MALIGNANT
4 ../data/manifest-1747944277650/CBIS-DDSM/Calc-... P_00049 RIGHT MLO MALIGNANT
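
Because the two manifests were downloaded separately, it is worth confirming that no image path appears twice after the concat; a minimal sketch:

# count image paths that occur more than once in the combined DataFrame
dupes = df_combined['image_path'].duplicated().sum()
print(f"Duplicate image paths: {dupes}")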

5.3 Convert DICOMs to NumPy Arrays for Training¶

In [93]:
import pydicom
import cv2
import numpy as np
from tqdm import tqdm

IMG_SIZE = 224
X = []
y = []

for _, row in tqdm(df_combined.iterrows(), total=len(df_combined)):
    try:
        dcm = pydicom.dcmread(row['image_path'])
        img = dcm.pixel_array.astype(np.float32)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0  # scale pixel values (note: DICOM intensities can exceed the 8-bit range)
        img = np.expand_dims(img, axis=-1)  
        X.append(img)
        y.append(row['pathology'])
    except Exception as e:
        print(f"Error loading image: {row['image_path']}\n{e}")

X = np.array(X)
y = np.array(y)
print(f"X shape: {X.shape}, Y shape: {y.shape}")
100%|█████████████████████████████████████████| 451/451 [00:13<00:00, 32.57it/s]
X shape: (451, 224, 224, 1), Y shape: (451,)

5.4 Encode Labels and Split Into Train/Test Sets¶

In [94]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
Train size: 360, Test size: 91
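
Since the split is stratified, the BENIGN/MALIGNANT proportions should be similar in both subsets; a minimal sketch to verify, using the encoder and splits created above:

import numpy as np

# per-class counts in each split (bincount follows the encoding order of le.classes_)
print("Train:", dict(zip(le.classes_, np.bincount(y_train))))
print("Test: ", dict(zip(le.classes_, np.bincount(y_test))))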

5.5 CNN Model Training¶

In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=8,
    callbacks=[early_stop]
)
Epoch 1/20
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 105ms/step - accuracy: 0.5406 - loss: 154.5802 - val_accuracy: 0.7582 - val_loss: 0.5514
Epoch 2/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.8028 - loss: 0.4685 - val_accuracy: 0.7473 - val_loss: 0.5173
Epoch 3/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.8554 - loss: 0.2990 - val_accuracy: 0.8132 - val_loss: 0.4883
Epoch 4/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.9445 - loss: 0.1448 - val_accuracy: 0.7912 - val_loss: 0.5455
Epoch 5/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.9773 - loss: 0.1115 - val_accuracy: 0.8242 - val_loss: 0.5814
Epoch 6/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 104ms/step - accuracy: 0.9909 - loss: 0.0599 - val_accuracy: 0.8352 - val_loss: 0.6627

5.6 Evaluate the Model Performance¶

In [102]:
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()

from sklearn.metrics import confusion_matrix, classification_report

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['BENIGN', 'MALIGNANT']))
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 90ms/step
Confusion Matrix:
[[42  7]
 [10 32]]

Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.81      0.86      0.83        49
   MALIGNANT       0.82      0.76      0.79        42

    accuracy                           0.81        91
   macro avg       0.81      0.81      0.81        91
weighted avg       0.81      0.81      0.81        91
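
Since this is the final iteration, the trained network can be persisted for later inference. A minimal sketch using the native Keras format (the file name calc_cnn_final.keras is only an example):

# save the trained model, then reload it to confirm the file round-trips
model.save('calc_cnn_final.keras')

from tensorflow.keras.models import load_model
restored = load_model('calc_cnn_final.keras')
restored.summary()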

5.7 Plot Accuracy/Loss Curves¶

In [103]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

5.8 Visual Sample Predictions¶

In [104]:
import random
num_samples = 6
indices = random.sample(range(len(X_test)), num_samples)

plt.figure(figsize=(15, 10))

for i, idx in enumerate(indices):
    img = X_test[idx].squeeze()
    true_label = y_test[idx]
    pred_prob = model.predict(X_test[idx:idx+1])[0][0]
    pred_label = int(pred_prob > 0.5)

    plt.subplot(2, 3, i+1)
    plt.imshow(img, cmap='gray')
    plt.title(f"True: {'MALIGNANT' if true_label else 'BENIGN'}\nPred: {'MALIGNANT' if pred_label else 'BENIGN'} ({pred_prob:.2f})")
    plt.axis('off')

plt.tight_layout()
plt.show()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
In [ ]: