Dataset information


source

MedDataset


def MedDataset(
    dataframe:NoneType=None, image_col:str=None, mask_col:str='mask_path', path:NoneType=None,
    img_list:NoneType=None, postfix:str='', apply_reorder:bool=True,
    dtype:(MedImage, MedMask)=MedImage,
    max_workers:int=1, use_cache:bool=True, cache_path:NoneType=None
):

A class to extract and present information about the dataset.


source

suggest_patch_size


def suggest_patch_size(
    dataset:MedDataset, target_spacing:list=None, min_patch_size:list=None, max_patch_size:list=None, divisor:int=16
)->list:

Suggest optimal patch size based on dataset dimensions.

Uses median shape as the starting point but clamps to the minimum volume size per axis, ensuring the suggested patch fits ALL volumes without requiring padding during training.

Algorithm: 1. Use min(median, min_volume) per axis for safety 2. Round down to nearest multiple of divisor (16 for UNet compatibility) 3. Clamp to [min_patch_size, max_patch_size] bounds 4. Validate: error if min_patch_size exceeds smallest volume

Args: dataset: MedDataset instance with analyzed images. target_spacing: Target voxel spacing [x, y, z]. If None, uses dataset.get_suggestion()['target_spacing']. min_patch_size: Minimum per dimension. Default [32, 32, 32]. max_patch_size: Maximum per dimension. Default [256, 256, 256]. divisor: Ensure divisibility (default 16 for UNet compatibility).

Returns: list: [patch_dim_0, patch_dim_1, patch_dim_2]

Example: >>> from fastMONAI.dataset_info import MedDataset >>> dataset = MedDataset(dataframe=df, mask_col='mask_path', dtype=MedMask) >>> >>> # Use recommended spacing >>> patch_size = suggest_patch_size(dataset) >>> >>> # Use custom spacing >>> patch_size = suggest_patch_size(dataset, target_spacing=[1.0, 1.0, 2.0])


source

preprocess_dataset


def preprocess_dataset(
    df, img_col, mask_col:NoneType=None, output_dir:str='preprocessed', target_spacing:NoneType=None,
    apply_reorder:bool=True, transforms:NoneType=None, max_workers:int=4, skip_existing:bool=True
):

Preprocess dataset to disk, creating new columns for preprocessed paths.

Processes images (and optionally masks) through a transform pipeline, saves to output_dir, then creates new '{col}_preprocessed' columns in the DataFrame. Original columns are preserved unchanged.

Transform pipeline order: CopyAffine (if masks) -> ToCanonical (if apply_reorder) -> Resample (if target_spacing) -> user transforms

Args: df: DataFrame with file paths. img_col: Column name for image paths. mask_col: Optional column name for mask paths. output_dir: Output directory. Creates images/ and masks/ subdirectories. target_spacing: Target voxel spacing for resampling (e.g., [1.0, 1.0, 1.0]). apply_reorder: Whether to reorder to RAS+ canonical orientation. transforms: Additional TorchIO or fastMONAI transforms to apply after reordering and resampling. max_workers: Number of parallel workers. Each worker loads a full 3D volume into memory, so reduce for large volumes. skip_existing: Skip files that already exist on disk (with size > 0).

# Inline tests for `preprocess_dataset` (run during the notebook/docs build).
# NOTE(review): relies on `tio`, `torch`, `pd`, `Path`, and `preprocess_dataset`
# being imported earlier in the file — not visible in this chunk.
import tempfile, shutil
from fastcore.test import test_eq, test_fail

_tmp = tempfile.mkdtemp()  # scratch directory; removed at the end of the tests

# Create synthetic NIfTI files: three tiny (1, 10, 10, 10) image/mask pairs
for i in range(3):
    tio.ScalarImage(tensor=torch.randn(1, 10, 10, 10)).save(f'{_tmp}/img_{i}.nii.gz')
    tio.LabelMap(tensor=torch.randint(0, 2, (1, 10, 10, 10))).save(f'{_tmp}/mask_{i}.nii.gz')

# Test 1: Image-only preprocessing (new columns, originals preserved)
_df1 = pd.DataFrame({'img': [f'{_tmp}/img_{i}.nii.gz' for i in range(3)]})
_orig_paths1 = _df1['img'].tolist()  # snapshot to verify the column is untouched
_out1 = f'{_tmp}/out1'
preprocess_dataset(_df1, img_col='img', output_dir=_out1, apply_reorder=False)
# Original column preserved
test_eq(_df1['img'].tolist(), _orig_paths1)
# New preprocessed column created
test_eq('img_preprocessed' in _df1.columns, True)
test_eq(all(Path(p).exists() for p in _df1['img_preprocessed']), True)
# NOTE(review): substring check assumes '/' separators; would not match on Windows paths
test_eq(all('out1/images/' in p for p in _df1['img_preprocessed']), True)

# Test 2: Skip-existing (rerun with original paths pointing to same filenames)
# Reuses _out1 from Test 1, so the outputs already exist on disk.
_df2 = pd.DataFrame({'img': [f'{_tmp}/img_{i}.nii.gz' for i in range(3)]})
preprocess_dataset(_df2, img_col='img', output_dir=_out1, apply_reorder=False)
# Should print "0 processed, 3 skipped"

# Test 3: With masks (both columns preserved, new columns created)
_df3 = pd.DataFrame({
    'img': [f'{_tmp}/img_{i}.nii.gz' for i in range(3)],
    'mask': [f'{_tmp}/mask_{i}.nii.gz' for i in range(3)],
})
_orig_img3 = _df3['img'].tolist()
_orig_mask3 = _df3['mask'].tolist()
_out3 = f'{_tmp}/out3'
preprocess_dataset(_df3, img_col='img', mask_col='mask', output_dir=_out3, apply_reorder=False)
# Original columns preserved
test_eq(_df3['img'].tolist(), _orig_img3)
test_eq(_df3['mask'].tolist(), _orig_mask3)
# New preprocessed columns created; masks land under a separate masks/ subdirectory
test_eq(all(Path(p).exists() for p in _df3['img_preprocessed']), True)
test_eq(all(Path(p).exists() for p in _df3['mask_preprocessed']), True)
test_eq(all('out3/masks/' in p for p in _df3['mask_preprocessed']), True)

# Test 4: Input validation — empty frame, missing column, duplicate paths
test_fail(lambda: preprocess_dataset(pd.DataFrame(), img_col='img'), contains='empty')
test_fail(lambda: preprocess_dataset(pd.DataFrame({'x': [1]}), img_col='img'), contains='not found')
_df_dup = pd.DataFrame({'img': [f'{_tmp}/img_0.nii.gz', f'{_tmp}/img_0.nii.gz']})
test_fail(lambda: preprocess_dataset(_df_dup, img_col='img'), contains='Duplicate')

shutil.rmtree(_tmp)  # clean up the scratch directory
Preprocessing complete: 3 processed, 0 skipped, 0 failed
Preprocessing complete: 0 processed, 3 skipped, 0 failed
Preprocessing complete: 3 processed, 0 skipped, 0 failed
Preprocessing:   0%|          | 0/3 [00:00<?, ?it/s]Preprocessing: 100%|##########| 3/3 [00:00<00:00, 746.32it/s]
Preprocessing:   0%|          | 0/3 [00:00<?, ?it/s]Preprocessing: 100%|##########| 3/3 [00:00<00:00, 216.48it/s]

source

get_class_weights


def get_class_weights(
    labels:(np.array, list), class_weight:str='balanced'
)->Tensor:

Calculates and returns the class weights.

Args: labels: An array or list of class labels for each instance in the dataset. class_weight: Defaults to 'balanced'.

Returns: A tensor of class weights.