gsp1000

`lacuna.utils.gsp1000` ¶

Utilities for working with GSP1000 connectome data.

This module provides functions to convert GSP1000 functional data into optimized HDF5 batch files for efficient lesion network mapping.

`create_connectome_batches(gsp_dir, mask_path, output_dir, subjects_per_batch=50, pattern='sub-/func/bld001_rest_*_finalmask.nii.gz', verbose=False)` ¶

Create HDF5 batch files from GSP1000 functional data.

Scans a directory of functional NIfTI files, extracts time-series from within a brain mask, and saves the data into multiple smaller HDF5 batch files optimized for memory-efficient lesion network mapping.

Parameters:

Name	Type	Description	Default
`gsp_dir`	`str or Path`	Directory containing GSP1000 subject functional data. Expected structure: sub-/func/bld001_rest_*_finalmask.nii.gz	required
`mask_path`	`str or Path`	Path to brain mask NIfTI file (e.g., MNI152_T1_2mm_Brain_Mask.nii.gz). Defines which voxels to extract.	required
`output_dir`	`str or Path`	Directory where HDF5 batch files will be saved.	required
`subjects_per_batch`	`int`	Number of subjects to include in each batch file. Larger batches = fewer files but more memory per batch.	`50`
`pattern`	`str`	Glob pattern to find functional files within gsp_dir. Default matches standard GSP1000 structure.	`'sub-/func/bld001_rest_*_finalmask.nii.gz'`
`verbose`	`bool`	Print progress information.	`True`

Returns:

Type	Description
`list of Path`	Paths to created HDF5 batch files, sorted by name.

Notes

Each HDF5 batch file contains: - 'timeseries': (n_subjects, n_timepoints, n_voxels) float32 array - 'mask_indices': (3, n_voxels) array of mask coordinates - 'mask_affine': (4, 4) affine transformation matrix - Attributes: n_subjects, n_timepoints, n_voxels, mask_shape

The batch files are designed for sequential loading during analysis, minimizing memory footprint while maintaining processing speed.

Examples:

>>> from lacuna.utils.gsp1000 import create_connectome_batches
>>> batch_files = create_connectome_batches(
...     gsp_dir="/data/GSP1000",
...     mask_path="/data/MNI152_T1_2mm_Brain_Mask.nii.gz",
...     output_dir="/data/connectomes/gsp1000_batches",
...     subjects_per_batch=100
... )
>>> print(f"Created {len(batch_files)} batch files")

Source code in src/lacuna/utils/gsp1000.py

def create_connectome_batches(
    gsp_dir: str | Path,
    mask_path: str | Path,
    output_dir: str | Path,
    subjects_per_batch: int = 50,
    pattern: str = "sub-*/func/*bld001_rest_*_finalmask.nii.gz",
    verbose: bool = False,
) -> list[Path]:
    """Create HDF5 batch files from GSP1000 functional data.

    Scans a directory of functional NIfTI files, extracts time-series from
    within a brain mask, and saves the data into multiple smaller HDF5 batch
    files optimized for memory-efficient lesion network mapping.

    Parameters
    ----------
    gsp_dir : str or Path
        Directory containing GSP1000 subject functional data.
        Expected structure: sub-*/func/*bld001_rest_*_finalmask.nii.gz
    mask_path : str or Path
        Path to brain mask NIfTI file (e.g., MNI152_T1_2mm_Brain_Mask.nii.gz).
        Defines which voxels to extract.
    output_dir : str or Path
        Directory where HDF5 batch files will be saved.
    subjects_per_batch : int, default=50
        Number of subjects to include in each batch file.
        Larger batches = fewer files but more memory per batch.
    pattern : str, optional
        Glob pattern to find functional files within gsp_dir.
        Default matches standard GSP1000 structure.
    verbose : bool, default=True
        Print progress information.

    Returns
    -------
    list of Path
        Paths to created HDF5 batch files, sorted by name.

    Notes
    -----
    Each HDF5 batch file contains:
    - 'timeseries': (n_subjects, n_timepoints, n_voxels) float32 array
    - 'mask_indices': (3, n_voxels) array of mask coordinates
    - 'mask_affine': (4, 4) affine transformation matrix
    - Attributes: n_subjects, n_timepoints, n_voxels, mask_shape

    The batch files are designed for sequential loading during analysis,
    minimizing memory footprint while maintaining processing speed.

    Examples
    --------
    >>> from lacuna.utils.gsp1000 import create_connectome_batches
    >>> batch_files = create_connectome_batches(
    ...     gsp_dir="/data/GSP1000",
    ...     mask_path="/data/MNI152_T1_2mm_Brain_Mask.nii.gz",
    ...     output_dir="/data/connectomes/gsp1000_batches",
    ...     subjects_per_batch=100
    ... )
    >>> print(f"Created {len(batch_files)} batch files")
    """
    gsp_dir = Path(gsp_dir)
    mask_path = Path(mask_path)
    output_dir = Path(output_dir)

    if verbose:
        print("🚀 Creating connectome batch files from GSP1000 data...")

    # 1. Find all functional NIfTI files
    search_pattern = str(gsp_dir / pattern)
    all_subject_files = sorted(glob.glob(search_pattern))

    if not all_subject_files:
        raise FileNotFoundError(
            f"No NIfTI files found matching pattern: {search_pattern}\n"
            f"Expected structure: {gsp_dir}/sub-*/func/*bld001_rest_*_finalmask.nii.gz"
        )

    n_total_subjects = len(all_subject_files)
    if verbose:
        print(f"✓ Found {n_total_subjects} subject files")

    # 2. Load brain mask and extract metadata
    if verbose:
        print(f"Loading brain mask: {mask_path}")
    mask_img = nib.load(mask_path)
    mask_data = mask_img.get_fdata().astype(bool)
    mask_affine = mask_img.affine
    in_mask_indices = np.where(mask_data)
    n_voxels = len(in_mask_indices[0])

    # Get number of timepoints from first subject
    first_img = nib.load(all_subject_files[0])
    n_timepoints = first_img.shape[3]

    if verbose:
        print(f"✓ Mask contains {n_voxels:,} in-brain voxels")
        print(f"✓ Detected {n_timepoints} timepoints per subject")

    # 3. Split subjects into batches
    subject_batches = [
        all_subject_files[i : i + subjects_per_batch]
        for i in range(0, n_total_subjects, subjects_per_batch)
    ]
    n_batches = len(subject_batches)

    if verbose:
        print(f"✓ Will create {n_batches} batch files ({subjects_per_batch} subjects/batch)")

    output_dir.mkdir(parents=True, exist_ok=True)
    created_files = []

    # 4. Process each batch
    progress_iter = tqdm(subject_batches, desc="Creating batches") if verbose else subject_batches

    for batch_idx, batch_files in enumerate(progress_iter):
        batch_filename = output_dir / f"connectome_batch_{batch_idx:03d}.h5"
        n_subjects_in_batch = len(batch_files)

        with h5py.File(batch_filename, "w") as hf:
            # Create dataset for this batch
            timeseries_dset = hf.create_dataset(
                "timeseries",
                shape=(n_subjects_in_batch, n_timepoints, n_voxels),
                dtype=np.float32,
                chunks=(1, n_timepoints, n_voxels),
                compression="gzip",
                compression_opts=1,  # Fast compression
            )

            # Store metadata (makes each batch self-contained)
            hf.create_dataset("mask_indices", data=np.vstack(in_mask_indices))
            hf.create_dataset("mask_affine", data=mask_affine)
            hf.attrs["n_subjects"] = n_subjects_in_batch
            hf.attrs["n_timepoints"] = n_timepoints
            hf.attrs["n_voxels"] = n_voxels
            hf.attrs["mask_shape"] = mask_data.shape

            # Process subjects in current batch
            for subj_idx, func_path in enumerate(batch_files):
                func_img = nib.load(func_path)
                func_data = func_img.get_fdata()

                # Extract in-mask voxels and transpose to (timepoints, voxels)
                subject_timeseries = func_data[in_mask_indices].T

                # Save to HDF5
                timeseries_dset[subj_idx, :, :] = subject_timeseries

        created_files.append(batch_filename)

    if verbose:
        print(f"\n✅ Created {len(created_files)} batch files in {output_dir}")
        total_size_mb = sum(f.stat().st_size for f in created_files) / (1024**2)
        print(f"✓ Total size: {total_size_mb:.1f} MB")

    return created_files

`validate_connectome_batches(batch_dir, verbose=False)` ¶

Validate integrity of HDF5 connectome batch files.