Cropping the data

Cropping the data#

## The following code ensures that all functions and init files are reloaded before execution.
%load_ext autoreload
%autoreload 2

from pathlib import Path
from insitupy import InSituData, CACHE

Load Xenium data into `InSituData` object#

Now the Xenium data can be parsed by providing the path to the InSituPy project folder.

insitupy_project = Path(CACHE / "out/demo_insitupy_project")
xd = InSituData.read(insitupy_project)

xd

InSituData
Method:		Xenium
Slide ID:	0001879
Sample ID:	Replicate 1
Path:		C:\Users\ge37voy\.cache\InSituPy\out\demo_insitupy_project


No modalities loaded.

# read all data modalities (the commented-out call below would skip the transcripts instead)
# xd.load_all(skip="transcripts")
xd.load_all()

xd

InSituData
Method:		Xenium
Slide ID:	0001879
Sample ID:	Replicate 1
Path:		C:\Users\ge37voy\.cache\InSituPy\out\demo_insitupy_project

    ➤ images
       'CD20':     (25778, 35416)
       'HE':       (25778, 35416, 3)
       'HER2':     (25778, 35416)
       'nuclei':   (25778, 35416)
    ➤ cells
       MultiCellData with main layer 'main'
           table
               AnnData object with n_obs × n_vars = 157600 × 297
               obs: 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'n_genes_by_counts', 'n_genes', 'leiden'
               var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
               uns: 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
               obsm: 'X_pca', 'X_umap', 'annotations', 'regions', 'spatial'
               varm: 'PCs'
               layers: 'counts', 'norm_counts'
               obsp: 'connectivities', 'distances'
           boundaries
               BoundariesData object with 2 entries:
                   cells
                   nuclei
    ➤ annotations
       demo:	4 annotations, 2 classes ('Negative', 'Positive')
       demo2:	5 annotations, 3 classes ('Negative', 'Other', 'Positive')
       demo3:	7 annotations, 5 classes ('Immune cells', 'Necrosis', 'Stroma', 'Tumor', 'unclassified')
       TestKey:	9 annotations, 2 classes ('TestClass', 'points')
    ➤ regions
       demo_regions:	3 regions, 3 classes ('Region1', 'Region2', 'Region3')
       TMA:	6 regions, 6 classes ('A-1', 'A-2', 'A-3', 'B-1', 'B-2', 'B-3')
    ➤ transcripts
       DataFrame with shape <dask_expr.expr.Scalar: expr=ReadParquetFSSpec(a817282).size() // 8, dtype=int64> x 8

# Visualize the data
xd.show()

2026-02-23 22:58:48 | [INFO] Extracting unique gene names from Dask DataFrame...
2026-02-23 22:58:50 | [INFO] Found 541 unique genes

Cropping of data#

There are two different methods implemented for cropping the data.

Option 1: Crop using limit values#

# crop to a rectangular window using the xlim/ylim arguments
xd_cropped = xd.crop(xlim=(2000,3000), ylim=(2000,3000))

xd_cropped

InSituData
Method:		Xenium
Slide ID:	0001879
Sample ID:	Replicate 1
Path:		None

    ➤ images
       'CD20':     (4706, 4706)
       'HE':       (4706, 4706, 3)
       'HER2':     (4706, 4706)
       'nuclei':   (4706, 4706)
    ➤ cells
       MultiCellData with main layer 'main'
           table
               AnnData object with n_obs × n_vars = 4550 × 297
               obs: 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'n_genes_by_counts', 'n_genes', 'leiden'
               var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
               uns: 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
               obsm: 'X_pca', 'X_umap', 'annotations', 'regions', 'spatial'
               varm: 'PCs'
               layers: 'counts', 'norm_counts'
               obsp: 'connectivities', 'distances'
           boundaries
               BoundariesData object with 2 entries:
                   cells
                   nuclei
    ➤ annotations
       demo:	2 annotations, 1 class ('Positive')
       demo3:	1 annotations, 1 class ('Stroma')
       TestKey:	1 annotations, 1 class ('TestClass')
    ➤ regions
       demo_regions:	1 regions, 1 class ('Region3')
    ➤ transcripts
       DataFrame with shape <dask_expr.expr.Scalar: expr=(Assign(frame=Assign(frame=Loc(frame=ReadParquetFSSpec(a817282)[['transcript_id', 'cell_id', 'overlaps_nucleus', 'feature_name', 'x_location', 'y_location', 'z_location', 'qv']], iindexer=ReadParquetFSSpec(a817282)['x_location'] >= 2000 & ReadParquetFSSpec(a817282)['x_location'] <= 3000 & ReadParquetFSSpec(a817282)['y_location'] >= 2000 & ReadParquetFSSpec(a817282)['y_location'] <= 3000)))).size() // 8, dtype=int64> x 8

xd_cropped.show()

2026-02-23 22:53:03 | [INFO] Extracting unique gene names from Dask DataFrame...
2026-02-23 22:53:07 | [INFO] Found 541 unique genes
2026-02-23 22:53:21 | [INFO] Loading coordinates for gene 'ACTA2'...
2026-02-23 22:53:23 | [INFO] Loaded 15270 coordinates for gene 'ACTA2'
2026-02-23 22:53:42 | [INFO] Loading coordinates for gene 'BANK1'...
2026-02-23 22:53:44 | [INFO] Loaded 856 coordinates for gene 'BANK1'
WARNING: Gene 'cd74' not found.
2026-02-23 22:54:07 | [INFO] Loading coordinates for gene 'CD4'...
2026-02-23 22:54:09 | [INFO] Loaded 5700 coordinates for gene 'CD4'

Option 2: Crop from `regions`#

We can also crop a region from the dataset. To specify the region, a tuple of the form `(region_key, region_name)` is used.

xd_cropped = xd.crop(
    region_tuple=("demo_regions", "Region1"))

xd_cropped

InSituData
Method:		Xenium
Slide ID:	0001879
Sample ID:	Replicate 1
Path:		None

    ➤ images
       'CD20':     (2701, 3309)
       'HE':       (2701, 3309, 3)
       'HER2':     (2701, 3309)
       'nuclei':   (2701, 3309)
    ➤ cells
       MultiCellData with main layer 'main'
           table
               AnnData object with n_obs × n_vars = 2289 × 297
               obs: 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'n_genes_by_counts', 'n_genes', 'leiden'
               var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
               uns: 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
               obsm: 'X_pca', 'X_umap', 'annotations', 'regions', 'spatial'
               varm: 'PCs'
               layers: 'counts', 'norm_counts'
               obsp: 'connectivities', 'distances'
           boundaries
               BoundariesData object with 2 entries:
                   cells
                   nuclei
    ➤ annotations
       TestKey:	5 annotations, 1 class ('points')
    ➤ regions
       demo_regions:	1 regions, 1 class ('Region1')
    ➤ transcripts
       DataFrame with shape <dask_expr.expr.Scalar: expr=(Assign(frame=Assign(frame=Loc(frame=ReadParquetFSSpec(a817282)[['transcript_id', 'cell_id', 'overlaps_nucleus', 'feature_name', 'x_location', 'y_location', 'z_location', 'qv']], iindexer=UFunc(within))))).size() // 8, dtype=int64> x 8

Saving the cropped data#

Visualizing cropped data might lead to performance issues. It is therefore recommended to first save and reload the cropped data before continuing.

Saving to the existing project path is not possible#

Because the data has been cropped, saving to the existing project path is not possible and the `.save()` method raises an error:

xd_cropped.save()

Reloading also does not work, because the cropped data has not yet been saved as an InSituPy project.

xd_cropped.reload()

No modalities with existing save path found. Consider saving the data with `saveas()` first.

Saving to new project directory#

cropped_insitupy_project = insitupy_project.parent / f"{insitupy_project.name}_cropped"

xd_cropped.saveas(cropped_insitupy_project, overwrite=True)

Saving data to C:\Users\ge37voy\.cache\InSituPy\out\demo_insitupy_project_cropped
Saved.

Reload from `InSituPy` project folder#

Reloading from the project folder makes visualization more efficient. Note, however, that only the modalities that had been loaded before cropping can be reloaded in this step.

# reload from insitupy project
xd_cropped = InSituData.read(cropped_insitupy_project)
xd_cropped.load_all()

xd_cropped

InSituData
Method:		Xenium
Slide ID:	0001879
Sample ID:	Replicate 1
Path:		C:\Users\ge37voy\.cache\InSituPy\out\demo_insitupy_project_cropped

    ➤ images
       'CD20':     (2701, 3309)
       'HE':       (2701, 3309, 3)
       'HER2':     (2701, 3309)
       'nuclei':   (2701, 3309)
    ➤ cells
       MultiCellData with main layer 'main'
           table
               AnnData object with n_obs × n_vars = 2289 × 297
               obs: 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'n_genes_by_counts', 'n_genes', 'leiden'
               var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
               uns: 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
               obsm: 'X_pca', 'X_umap', 'annotations', 'regions', 'spatial'
               varm: 'PCs'
               layers: 'counts', 'norm_counts'
               obsp: 'connectivities', 'distances'
           boundaries
               BoundariesData object with 2 entries:
                   cells
                   nuclei
    ➤ annotations
       TestKey:	5 annotations, 1 class ('points')
    ➤ regions
       demo_regions:	1 regions, 1 class ('Region1')
    ➤ transcripts
       DataFrame with shape <dask_expr.expr.Scalar: expr=ReadParquetFSSpec(5291463).size() // 8, dtype=int64> x 8

xd_cropped.show()

2026-02-23 23:01:11 | [INFO] Extracting unique gene names from Dask DataFrame...
2026-02-23 23:01:11 | [INFO] Found 536 unique genes
2026-02-23 23:01:27 | [INFO] Loading coordinates for gene 'ACTA2'...
2026-02-23 23:01:27 | [INFO] Loaded 5843 coordinates for gene 'ACTA2'