import pandas as pd
def test_image_label_merge():
    df_labels = pd.DataFrame({
        "image_id": ["img1.png", "img2.png"],
        "patient_id": [101, 102]})
    df_meta = pd.DataFrame({
        "patient_id": [101, 102],
        "pathology": ["BENIGN", "MALIGNANT"]})
    df = df_labels.merge(df_meta, on="patient_id", how="left")
    assert df.loc[0, 'pathology'] == "BENIGN", "Label mismatch for img1"
    assert df.loc[1, 'pathology'] == "MALIGNANT", "Label mismatch for img2"
    print("Test passed: image-label matching works.")

test_image_label_merge()
Test passed: image-label matching works.
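A quick follow-up check (a minimal sketch, not part of the pipeline; the function name and the extra image are hypothetical): with a left join, an image whose patient_id has no metadata row should come back with a NaN pathology rather than being dropped.
def test_unmatched_image_keeps_nan():
    # hypothetical extra image whose patient has no metadata entry
    df_labels = pd.DataFrame({
        "image_id": ["img1.png", "img3.png"],
        "patient_id": [101, 999]})
    df_meta = pd.DataFrame({
        "patient_id": [101],
        "pathology": ["BENIGN"]})
    df = df_labels.merge(df_meta, on="patient_id", how="left")
    assert len(df) == 2, "Left join must keep unmatched images"
    assert pd.isna(df.loc[1, "pathology"]), "Unmatched image should get NaN pathology"
    print("Test passed: unmatched images are kept with NaN labels.")

test_unmatched_image_keeps_nan()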
!pip install pandas matplotlib opencv-python tensorflow pydicom scikit-learn
Requirement already satisfied: pandas, matplotlib, opencv-python, tensorflow, pydicom, scikit-learn (and all of their dependencies) in /Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages
Build a DataFrame containing the paths and metadata of all images in the CBIS-DDSM folder.
import os
import pandas as pd
cbis_root = '../data/manifest-1747663624077/CBIS-DDSM/'
image_paths = []
for root, dirs, files in os.walk(cbis_root):
    for file in files:
        if file.endswith('.dcm'):
            full_path = os.path.join(root, file)
            parts = full_path.split(os.sep)
            folder_name = parts[-4] if len(parts) >= 4 else ''
            try:
                patient_info = folder_name.split('_P_')
                if len(patient_info) == 2:
                    patient_id_raw, view = patient_info[1].split('_', 1)
                    patient_id = 'P_' + patient_id_raw
                    image_paths.append({
                        'image_path': full_path,
                        'patient_id': patient_id,
                        'view': view})
            except Exception as e:
                print(f"Skipping {full_path}: {e}")
                continue
df_images = pd.DataFrame(image_paths)
print(f"Total images found: {len(df_images)}")
df_images
Total images found: 12
| | image_path | patient_id | view |
|---|---|---|---|
| 0 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00041 | LEFT_CC |
| 1 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00163 | LEFT_MLO |
| 2 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00141 | LEFT_MLO |
| 3 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00140 | LEFT_MLO |
| 4 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00141 | LEFT_CC |
| 5 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_MLO |
| 6 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_CC |
| 7 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00140 | LEFT_CC |
| 8 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00041 | LEFT_MLO |
| 9 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00163 | LEFT_CC |
| 10 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00100 | RIGHT_MLO |
| 11 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00100 | RIGHT_CC |
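Before merging with labels, a quick sanity check on the parsed metadata is cheap (a small sketch using the df_images frame built above): confirm how many distinct patients were found, how the views are distributed, and that no fields came out empty.
# Sanity check on the parsed folder metadata (assumes df_images from the cell above)
print("Unique patients:", df_images['patient_id'].nunique())
print(df_images['view'].value_counts())
print(df_images.isna().sum())  # any parsing gaps show up as missing values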
labels_test_df = pd.read_csv('../data/calc_case_description_test_set.csv')
print("Sample IDs:", labels_test_df['patient_id'].unique()[:5])
labels_subset = labels_test_df[['patient_id', 'pathology']].drop_duplicates()
df_merged = df_images.merge(labels_subset, on='patient_id', how='left')
print(f"Total labeled images: {df_merged['pathology'].notnull().sum()} / {len(df_merged)}")
df_merged.head(12)
Sample IDs: ['P_00038' 'P_00041' 'P_00077' 'P_00100' 'P_00127']
Total labeled images: 14 / 14
| | image_path | patient_id | view | pathology |
|---|---|---|---|---|
| 0 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00041 | LEFT_CC | BENIGN_WITHOUT_CALLBACK |
| 1 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00163 | LEFT_MLO | BENIGN |
| 2 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00141 | LEFT_MLO | BENIGN |
| 3 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00140 | LEFT_MLO | BENIGN_WITHOUT_CALLBACK |
| 4 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00141 | LEFT_CC | BENIGN |
| 5 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_MLO | BENIGN |
| 6 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_MLO | BENIGN_WITHOUT_CALLBACK |
| 7 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_CC | BENIGN |
| 8 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00038 | LEFT_CC | BENIGN_WITHOUT_CALLBACK |
| 9 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00140 | LEFT_CC | BENIGN_WITHOUT_CALLBACK |
| 10 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00041 | LEFT_MLO | BENIGN_WITHOUT_CALLBACK |
| 11 | ../data/manifest-1747663624077/CBIS-DDSM/Calc-... | P_00163 | LEFT_CC | BENIGN |
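Note that the merge on patient_id alone inflated the 12 images into 14 rows, most likely because the case-description CSV lists one row per abnormality, so a patient with both a BENIGN and a BENIGN_WITHOUT_CALLBACK finding (e.g. P_00038 above) matches every image twice. A diagnostic sketch, with one simple but lossy way to resolve it:
# Inspect images that picked up more than one pathology row in the merge
dupes = df_merged[df_merged.duplicated(subset=['image_path'], keep=False)]
print(dupes[['image_path', 'patient_id', 'pathology']])

# One simple (lossy) resolution: keep the first label per image
df_unique = df_merged.drop_duplicates(subset=['image_path'], keep='first')
print(f"{len(df_unique)} unique images after dropping duplicate labels")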
Save the merged dataset for training/processing.
df_merged.to_csv('image_labels.csv', index=False)
print("CSV saved")
CSV saved
Check that the DICOM image preprocessing works correctly:
import pandas as pd
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
df = pd.read_csv('image_labels.csv')
def load_and_preprocess_dicom(path, size=(224, 224)):
    dicom = pydicom.dcmread(path)
    img = dicom.pixel_array.astype(np.float32)
    img -= img.min()
    img /= img.max()
    img_resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    return img_resized
example_path = df['image_path'].iloc[0]
example_img = load_and_preprocess_dicom(example_path)
plt.imshow(example_img, cmap='gray')
plt.title(df['pathology'].iloc[0])
plt.axis('off')
plt.show()
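The min-max normalization above divides by img.max() after shifting, which produces NaNs if a DICOM happens to be constant. A guarded variant, sketched here under the same assumptions (the _safe suffix is just an illustrative name), skips the division when the dynamic range is zero.
def load_and_preprocess_dicom_safe(path, size=(224, 224)):
    # Same idea as above, but guards against a constant image (zero dynamic range)
    dicom = pydicom.dcmread(path)
    img = dicom.pixel_array.astype(np.float32)
    img -= img.min()
    max_val = img.max()
    if max_val > 0:
        img /= max_val
    return cv2.resize(img, size, interpolation=cv2.INTER_AREA)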
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
images = []
labels = []
le = LabelEncoder()
df['pathology_encoded'] = le.fit_transform(df['pathology'])
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        img = load_and_preprocess_dicom(row['image_path'])
        images.append(img)
        labels.append(row['pathology_encoded'])
    except Exception as e:
        print(f"Failed to process {row['image_path']}: {e}")
X = np.array(images).reshape(-1, 224, 224, 1)
y = np.array(labels)
print("Data loaded. Shape:", X.shape, y.shape)
print("Classes:", le.classes_)
100%|███████████████████████████████████████████| 14/14 [00:01<00:00, 9.62it/s]
Data loaded. Shape: (14, 224, 224, 1) (14,)
Classes: ['BENIGN' 'BENIGN_WITHOUT_CALLBACK']
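With only 14 images it is worth checking the class balance before training, and note that both classes here are benign variants (BENIGN vs BENIGN_WITHOUT_CALLBACK), so this first model is not yet separating benign from malignant. A quick check using the arrays built above:
# Class balance of the 14-image subset
unique_classes, counts = np.unique(y, return_counts=True)
for cls, n in zip(le.inverse_transform(unique_classes), counts):
    print(f"{cls}: {n}")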
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
y_cat = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, batch_size=4, validation_data=(X_test, y_test))
Epoch 1/10
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 94ms/step - accuracy: 0.5994 - loss: 0.6512 - val_accuracy: 0.3333 - val_loss: 1.4282
Epoch 2/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.6761 - loss: 1.0614 - val_accuracy: 0.6667 - val_loss: 0.6576
Epoch 3/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 70ms/step - accuracy: 0.7841 - loss: 0.4869 - val_accuracy: 0.6667 - val_loss: 0.6139
Epoch 4/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.6449 - loss: 0.6840 - val_accuracy: 0.6667 - val_loss: 0.6537
Epoch 5/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.7074 - loss: 0.5216 - val_accuracy: 0.3333 - val_loss: 0.6782
Epoch 6/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.6307 - loss: 0.4953 - val_accuracy: 0.3333 - val_loss: 0.6649
Epoch 7/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 73ms/step - accuracy: 0.5994 - loss: 0.5461 - val_accuracy: 0.6667 - val_loss: 0.6184
Epoch 8/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 76ms/step - accuracy: 0.8153 - loss: 0.4038 - val_accuracy: 0.6667 - val_loss: 0.6598
Epoch 9/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 67ms/step - accuracy: 0.7216 - loss: 0.4577 - val_accuracy: 0.6667 - val_loss: 0.6401
Epoch 10/10
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 67ms/step - accuracy: 0.8778 - loss: 0.3657 - val_accuracy: 0.6667 - val_loss: 0.6339
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Acc')
plt.plot(history.history['val_accuracy'], label='Val Acc')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
y_pred_probs = model.predict(X_test)
y_pred_classes = y_pred_probs.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)
print(classification_report(y_true_classes, y_pred_classes, target_names=le.classes_))
cm = confusion_matrix(y_true_classes, y_pred_classes)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 25ms/step
                         precision    recall  f1-score   support

                 BENIGN       0.50      1.00      0.67         1
BENIGN_WITHOUT_CALLBACK       1.00      0.50      0.67         2

               accuracy                           0.67         3
              macro avg       0.75      0.75      0.67         3
           weighted avg       0.83      0.67      0.67         3
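ConfusionMatrixDisplay is imported above but never used; a short sketch to visualize the matrix already stored in cm:
# Plot the confusion matrix computed above
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.tight_layout()
plt.show()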
for i in range(len(X_test)):
    plt.imshow(X_test[i].reshape(224, 224), cmap='gray')
    plt.title(f"True: {le.classes_[y_true_classes[i]]} | Predicted: {le.classes_[y_pred_classes[i]]}")
    plt.axis('off')
    plt.show()
import os
import pandas as pd
df_train = pd.read_csv('../data/calc_case_description_train_set.csv')
df_test = pd.read_csv('../data/calc_case_description_test_set.csv')
df_labels = pd.concat([df_train, df_test], ignore_index=True)
df_labels['patient_id'] = df_labels['patient_id'].str.strip()
df_labels['left or right breast'] = df_labels['left or right breast'].str.upper()
df_labels['image view'] = df_labels['image view'].str.upper()
cbis_root = '../data/manifest-1747932126245/CBIS-DDSM/'
image_data = []
for case_folder in os.listdir(cbis_root):
    full_case_path = os.path.join(cbis_root, case_folder)
    if not os.path.isdir(full_case_path):
        continue
    try:
        parts = case_folder.split('_')
        patient_id = f'P_{parts[2]}'
        side = parts[3]
        view = parts[4]
        for root, dirs, files in os.walk(full_case_path):
            for file in files:
                if file.endswith('.dcm'):
                    image_path = os.path.join(root, file)
                    image_data.append({
                        'image_path': image_path,
                        'patient_id': patient_id,
                        'side': side,
                        'view': view
                    })
    except Exception as e:
        print(f"Skipping {case_folder}: {e}")
        continue
df_images = pd.DataFrame(image_data)
df_merged = df_images.merge(
    df_labels,
    left_on=['patient_id', 'side', 'view'],
    right_on=['patient_id', 'left or right breast', 'image view'],
    how='inner'
)
df_final = df_merged[['image_path', 'patient_id', 'side', 'view', 'pathology']].copy()
df_final = df_final[df_final['pathology'].isin(['BENIGN', 'MALIGNANT'])].copy()
print(f"Total matched DICOMs: {len(df_final)}")
df_final.head()
Total matched DICOMs: 107
| | image_path | patient_id | side | view | pathology |
|---|---|---|---|---|---|
| 0 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_00063 | RIGHT | CC | MALIGNANT |
| 1 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_00071 | RIGHT | MLO | BENIGN |
| 2 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_00062 | LEFT | CC | BENIGN |
| 3 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_00649 | LEFT | CC | BENIGN |
| 4 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_01154 | LEFT | MLO | MALIGNANT |
df_final.to_csv('../data/image_labels.csv', index=False)
print("Saved CSV")
Saved CSV
import pydicom
import cv2
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
IMG_SIZE = 224
X = []
y = []
for _, row in tqdm(df_final.iterrows(), total=len(df_final)):
    try:
        dcm = pydicom.dcmread(row['image_path'])
        img = dcm.pixel_array.astype(np.float32)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0  # scale pixel values (note: DICOM pixel depth is often >8 bits; see the note below)
        img = np.expand_dims(img, axis=-1)
        X.append(img)
        y.append(row['pathology'])
    except Exception as e:
        print(f"Error with {row['image_path']}: {e}")
X = np.array(X)
y = np.array(y)
print(f"X shape: {X.shape}, Y shape: {y.shape}")
100%|█████████████████████████████████████████| 107/107 [00:03<00:00, 32.59it/s]
X shape: (107, 224, 224, 1), Y shape: (107,)
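A caveat on the scaling step: DICOM mammograms usually store pixel values at more than 8 bits, so dividing by 255 does not bring them into [0, 1]. A hedged alternative is per-image min-max scaling, sketched below (the helper name is illustrative; this is not what the run above used).
def normalize_dicom(img):
    # Per-image min-max scaling; robust to 12/16-bit pixel data
    img = img.astype(np.float32)
    img -= img.min()
    rng = img.max()
    return img / rng if rng > 0 else img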
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
Train samples: 85, Test samples: 22
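One more caution: a plain train_test_split at the image level can put two views of the same patient into both the train and test sets, which tends to inflate test accuracy. A patient-level split is sketched below (a sketch only, assuming every DICOM loaded successfully so the rows of df_final align with X; the _g-suffixed names are illustrative).
from sklearn.model_selection import GroupShuffleSplit

# Split by patient so no patient_id appears in both sets
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y_encoded, groups=df_final['patient_id'].values))
X_train_g, X_test_g = X[train_idx], X[test_idx]
y_train_g, y_test_g = y_encoded[train_idx], y_encoded[test_idx]
print(f"Patient-level split: {len(X_train_g)} train, {len(X_test_g)} test")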
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=8,
    callbacks=[early_stop]
)
Epoch 1/20
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
11/11 ━━━━━━━━━━━━━━━━━━━━ 2s 111ms/step - accuracy: 0.5419 - loss: 99.7223 - val_accuracy: 0.5000 - val_loss: 9.1453
Epoch 2/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 109ms/step - accuracy: 0.7035 - loss: 3.4935 - val_accuracy: 0.4545 - val_loss: 0.7975
Epoch 3/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 127ms/step - accuracy: 0.8602 - loss: 0.5257 - val_accuracy: 0.5000 - val_loss: 0.8758
Epoch 4/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 107ms/step - accuracy: 0.9224 - loss: 0.2221 - val_accuracy: 0.5000 - val_loss: 0.8598
Epoch 5/20
11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 107ms/step - accuracy: 0.9326 - loss: 0.2097 - val_accuracy: 0.5000 - val_loss: 1.0553
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 111ms/step
from sklearn.metrics import confusion_matrix, classification_report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['BENIGN', 'MALIGNANT']))
Confusion Matrix:
[[3 7]
[5 7]]
Classification Report:
precision recall f1-score support
BENIGN 0.38 0.30 0.33 10
MALIGNANT 0.50 0.58 0.54 12
accuracy 0.45 22
macro avg 0.44 0.44 0.44 22
weighted avg 0.44 0.45 0.45 22
import os
import pandas as pd
df_train = pd.read_csv('../data/calc_case_description_train_set.csv')
df_test = pd.read_csv('../data/calc_case_description_test_set.csv')
df_labels = pd.concat([df_train, df_test], ignore_index=True)
df_labels['patient_id'] = df_labels['patient_id'].str.strip()
df_labels['left or right breast'] = df_labels['left or right breast'].str.upper()
df_labels['image view'] = df_labels['image view'].str.upper()
cbis_root_new = '../data/manifest-1747944277650/CBIS-DDSM/'
new_image_data = []
for case_folder in os.listdir(cbis_root_new):
    full_case_path = os.path.join(cbis_root_new, case_folder)
    if not os.path.isdir(full_case_path):
        continue
    try:
        parts = case_folder.split('_')
        patient_id = f'P_{parts[2]}'
        side = parts[3]
        view = parts[4]
        for root, dirs, files in os.walk(full_case_path):
            for file in files:
                if file.endswith('.dcm'):
                    image_path = os.path.join(root, file)
                    new_image_data.append({
                        'image_path': image_path,
                        'patient_id': patient_id,
                        'side': side,
                        'view': view
                    })
    except Exception as e:
        print(f"Skipping {case_folder}: {e}")
        continue
df_new_images = pd.DataFrame(new_image_data)
df_new_merged = df_new_images.merge(
    df_labels,
    left_on=['patient_id', 'side', 'view'],
    right_on=['patient_id', 'left or right breast', 'image view'],
    how='inner'
)
df_new_final = df_new_merged[['image_path', 'patient_id', 'side', 'view', 'pathology']].copy()
df_new_final = df_new_final[df_new_final['pathology'].isin(['BENIGN', 'MALIGNANT'])]
print(f"Matched new DICOMs: {len(df_new_final)}")
df_new_final.head()
Matched new DICOMs: 344
| | image_path | patient_id | side | view | pathology |
|---|---|---|---|---|---|
| 0 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00467 | RIGHT | MLO | MALIGNANT |
| 1 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00445 | LEFT | MLO | MALIGNANT |
| 2 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00243 | LEFT | CC | MALIGNANT |
| 3 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00377 | LEFT | MLO | BENIGN |
| 4 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00297 | RIGHT | CC | BENIGN |
df_combined = pd.concat([df_final, df_new_final], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Total dataset size after merge: {len(df_combined)}")
df_combined.head()
Total dataset size after merge: 451
| | image_path | patient_id | side | view | pathology |
|---|---|---|---|---|---|
| 0 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_01004 | LEFT | CC | BENIGN |
| 1 | ../data/manifest-1747932126245/CBIS-DDSM/Calc-... | P_00663 | RIGHT | MLO | BENIGN |
| 2 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_01022 | LEFT | MLO | BENIGN |
| 3 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00005 | RIGHT | MLO | MALIGNANT |
| 4 | ../data/manifest-1747944277650/CBIS-DDSM/Calc-... | P_00049 | RIGHT | MLO | MALIGNANT |
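Because the two manifests could in principle contain overlapping cases, a quick duplicate check before loading pixels is cheap insurance (a diagnostic sketch over df_combined; the resolution line is left commented out since it was not applied in this run):
# Check for duplicates introduced by combining two manifests
print("Duplicate paths:", df_combined.duplicated(subset=['image_path']).sum())
print("Duplicate patient/side/view combos:",
      df_combined.duplicated(subset=['patient_id', 'side', 'view']).sum())
# If any combos repeat, one option is to keep a single image per case:
# df_combined = df_combined.drop_duplicates(subset=['patient_id', 'side', 'view'])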
import pydicom
import cv2
import numpy as np
from tqdm import tqdm
IMG_SIZE = 224
X = []
y = []
for _, row in tqdm(df_combined.iterrows(), total=len(df_combined)):
    try:
        dcm = pydicom.dcmread(row['image_path'])
        img = dcm.pixel_array.astype(np.float32)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0
        img = np.expand_dims(img, axis=-1)
        X.append(img)
        y.append(row['pathology'])
    except Exception as e:
        print(f"Error loading image: {row['image_path']}\n{e}")
X = np.array(X)
y = np.array(y)
print(f"X shape: {X.shape}, Y shape: {y.shape}")
100%|█████████████████████████████████████████| 451/451 [00:13<00:00, 32.57it/s]
X shape: (451, 224, 224, 1), Y shape: (451,)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
Train size: 360, Test size: 91
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=8,
    callbacks=[early_stop]
)
Epoch 1/20
/Users/hubertsimonbom/anaconda3/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 105ms/step - accuracy: 0.5406 - loss: 154.5802 - val_accuracy: 0.7582 - val_loss: 0.5514
Epoch 2/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.8028 - loss: 0.4685 - val_accuracy: 0.7473 - val_loss: 0.5173
Epoch 3/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.8554 - loss: 0.2990 - val_accuracy: 0.8132 - val_loss: 0.4883
Epoch 4/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.9445 - loss: 0.1448 - val_accuracy: 0.7912 - val_loss: 0.5455
Epoch 5/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 103ms/step - accuracy: 0.9773 - loss: 0.1115 - val_accuracy: 0.8242 - val_loss: 0.5814
Epoch 6/20
45/45 ━━━━━━━━━━━━━━━━━━━━ 5s 104ms/step - accuracy: 0.9909 - loss: 0.0599 - val_accuracy: 0.8352 - val_loss: 0.6627
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()
from sklearn.metrics import confusion_matrix, classification_report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['BENIGN', 'MALIGNANT']))
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 90ms/step
Confusion Matrix:
[[42  7]
 [10 32]]

Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.81      0.86      0.83        49
   MALIGNANT       0.82      0.76      0.79        42

    accuracy                           0.81        91
   macro avg       0.81      0.81      0.81        91
weighted avg       0.81      0.81      0.81        91
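Since the final layer outputs a probability, a threshold-free summary such as ROC AUC complements the 0.5-threshold report above. A short sketch using the predictions already computed (plt was imported earlier in the notebook):
from sklearn.metrics import roc_auc_score, roc_curve

auc = roc_auc_score(y_test, y_pred_probs.flatten())
print(f"ROC AUC: {auc:.3f}")

fpr, tpr, _ = roc_curve(y_test, y_pred_probs.flatten())
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()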
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import random
num_samples = 6
indices = random.sample(range(len(X_test)), num_samples)
plt.figure(figsize=(15, 10))
for i, idx in enumerate(indices):
    img = X_test[idx].squeeze()
    true_label = y_test[idx]
    pred_prob = model.predict(X_test[idx:idx+1])[0][0]
    pred_label = int(pred_prob > 0.5)
    plt.subplot(2, 3, i+1)
    plt.imshow(img, cmap='gray')
    plt.title(f"True: {'MALIGNANT' if true_label else 'BENIGN'}\nPred: {'MALIGNANT' if pred_label else 'BENIGN'} ({pred_prob:.2f})")
    plt.axis('off')
plt.tight_layout()
plt.show()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
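Two small wrap-up touches, sketched under the same session state: calling model.predict once per image in the loop above is what produces the repeated 1/1 step lines, so predicting the sampled batch in a single call is faster; and the trained model can be persisted for reuse (the filename is illustrative).
# Batch the predictions for the sampled test images in a single call
sample_probs = model.predict(X_test[indices]).flatten()
for prob, idx in zip(sample_probs, indices):
    label = 'MALIGNANT' if prob > 0.5 else 'BENIGN'
    print(f"Test image {idx}: {label} ({prob:.2f})")

# Persist the trained model (path is illustrative)
model.save('cbis_calc_cnn.keras')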