Query individual files#
Here, we’ll query individual files and inspect their metadata.
This guide can be skipped if you are only interested in how to leverage the overall dataset.
import lamindb as ln
import lnschema_bionty as lb
import anndata as ad
💡 lamindb instance: testuser1/test-scrna
ln.track()
💡 notebook imports: anndata==0.9.2 lamindb==0.61.0 lnschema_bionty==0.35.1
💡 saved: Transform(uid='agayZTonayqAz8', name='Query individual files', short_name='scrna3', version='0', type=notebook, updated_at=2023-11-20 19:13:30 UTC, created_by_id=1)
💡 saved: Run(uid='VbfQiFDcsiUPOmxeOyV8', run_at=2023-11-20 19:13:30 UTC, transform_id=3, created_by_id=1)
Query files by provenance metadata#
users = ln.User.lookup()
ln.Transform.filter(created_by=users.testuser1).search("scrna")
uid | score | |
---|---|---|
name | ||
scRNA-seq | Nv48yAceNSh8z8 | 90.0 |
Append a new batch of data | ManDYgmftZ8Cz8 | 36.0 |
Query individual files | agayZTonayqAz8 | 36.0 |
transform = ln.Transform.filter(uid="Nv48yAceNSh8z8").one()
ln.File.filter(transform=transform).df()
uid | storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | visibility | key_is_virtual | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||
1 | 7cZL0wUTAhbYPoRkwJ94 | 1 | scrna/conde22.h5ad | .h5ad | AnnData | Human immune cells from Conde22 | None | 57612943 | 9sXda5E7BYiVoDOQkTC0KB | sha1-fl | 1 | 1 | None | 0 | True | 2023-11-20 19:12:55.749681+00:00 | 1 |
Query files by biological metadata#
assays = lb.ExperimentalFactor.lookup()
organism = lb.Organism.lookup()
cell_types = lb.CellType.lookup()
query = ln.File.filter(
experimental_factors=assays.single_cell_rna_sequencing,
organism=organism.human,
cell_types=cell_types.gamma_delta_t_cell,
)
query.df()
uid | storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | visibility | key_is_virtual | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||
1 | 7cZL0wUTAhbYPoRkwJ94 | 1 | scrna/conde22.h5ad | .h5ad | AnnData | Human immune cells from Conde22 | None | 57612943 | 9sXda5E7BYiVoDOQkTC0KB | sha1-fl | 1 | 1 | None | 0 | True | 2023-11-20 19:12:55.749681+00:00 | 1 |
Inspect file metadata#
query_set = ln.File.filter().all()
file1, file2 = query_set[0], query_set[1]
file1.describe()
File(uid='7cZL0wUTAhbYPoRkwJ94', key='scrna/conde22.h5ad', suffix='.h5ad', accessor='AnnData', description='Human immune cells from Conde22', size=57612943, hash='9sXda5E7BYiVoDOQkTC0KB', hash_type='sha1-fl', visibility=0, key_is_virtual=True, updated_at=2023-11-20 19:12:55 UTC)
Provenance:
🗃️ storage: Storage(uid='1DHSaXuk', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-11-20 19:12:34 UTC, created_by_id=1)
📔 transform: Transform(uid='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-11-20 19:12:38 UTC, created_by_id=1)
👣 run: Run(uid='krXmpiuQnyAVVwpu9Rst', run_at=2023-11-20 19:12:38 UTC, transform_id=1, created_by_id=1)
👤 created_by: User(uid='DzTjkKse', handle='testuser1', name='Test User1', updated_at=2023-11-20 19:12:34 UTC)
⬇️ input_of (core.Run): ['2023-11-20 19:13:01 UTC']
Features:
var: FeatureSet(uid='BYD1KRzo3u4Y4DImNYZ4', n=36390, type='number', registry='bionty.Gene', hash='rMZltwoBCMdVPVR8x6nJ', updated_at=2023-11-20 19:12:52 UTC, created_by_id=1)
'MIR1302-2HG', 'FAM138A', 'OR4F5', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'OR4F29', 'None', 'OR4F16', 'None', 'LINC01409', 'FAM87B', 'LINC01128', 'LINC00115', 'FAM41C', 'None', ...
obs: FeatureSet(uid='TQSf0AMgXjOMz53oPxBi', n=4, registry='core.Feature', hash='5Nc89cKbUXM3R-6eoEru', updated_at=2023-11-20 19:12:53 UTC, created_by_id=1)
🔗 cell_type (32, bionty.CellType): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage', ...
🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 3' v3', '10x 5' v2', '10x 5' v1'
🔗 tissue (17, bionty.Tissue): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow', ...
🔗 donor (12, core.ULabel): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C', ...
Labels:
🏷️ organism (1, bionty.Organism): 'human'
🏷️ tissues (17, bionty.Tissue): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow', ...
🏷️ cell_types (32, bionty.CellType): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage', ...
🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 3' v3', '10x 5' v2', '10x 5' v1'
🏷️ ulabels (12, core.ULabel): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C', ...
file1.view_flow()
file2.describe()
File(uid='XypRi49vsjGVIctT8Wk9', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=857752, hash='SAuVZAKKM_Ypj_0SdrhDIg', hash_type='md5', visibility=0, key_is_virtual=True, updated_at=2023-11-20 19:13:22 UTC)
Provenance:
🗃️ storage: Storage(uid='1DHSaXuk', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-11-20 19:12:34 UTC, created_by_id=1)
📔 transform: Transform(uid='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna2', version='0', type='notebook', updated_at=2023-11-20 19:13:01 UTC, created_by_id=1)
👣 run: Run(uid='8EPeNnTq2dVnrUtdV29O', run_at=2023-11-20 19:13:01 UTC, transform_id=2, created_by_id=1)
👤 created_by: User(uid='DzTjkKse', handle='testuser1', name='Test User1', updated_at=2023-11-20 19:12:34 UTC)
Features:
var: FeatureSet(uid='jkszan5WQdg1162ngfND', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-11-20 19:13:22 UTC, created_by_id=1)
'IL18', 'NPM3', 'S100A9', 'S100A8', 'CNN2', 'ARHGAP45', 'RNF34', 'GPX4', 'S100A6', 'ADISSP', 'S100A4', 'FAM174C', 'SIT1', 'CCDC107', 'RSL1D1', 'TLN1', 'HES4', 'TNFRSF17', 'PCNA', 'RAB13', ...
obs: FeatureSet(uid='RytvOstSocKHUjlGdwst', n=1, registry='core.Feature', hash='QiqpN9CN6RveZnr6y1FC', updated_at=2023-11-20 19:13:22 UTC, created_by_id=1)
🔗 cell_type (9, bionty.CellType): 'dendritic cell', 'CD38-positive naive B cell', 'B cell, CD19-positive', 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated', 'CD16-positive, CD56-dim natural killer cell, human', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD14-positive, CD16-negative classical monocyte'
external: FeatureSet(uid='t2RaPTykMvqmedAbQDnV', n=2, registry='core.Feature', hash='mNZ755renGkdKW9rXQ_f', updated_at=2023-11-20 19:13:22 UTC, created_by_id=1)
🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
🔗 organism (1, bionty.Organism): 'human'
Labels:
🏷️ organism (1, bionty.Organism): 'human'
🏷️ cell_types (9, bionty.CellType): 'dendritic cell', 'CD38-positive naive B cell', 'B cell, CD19-positive', 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated', 'CD16-positive, CD56-dim natural killer cell, human', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD14-positive, CD16-negative classical monocyte'
🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
file2.view_flow()
Compare features#
Here we compute shared genes without loading files:
file1_genes = file1.features["var"]
file2_genes = file2.features["var"]
shared_genes = file1_genes & file2_genes
len(shared_genes)
749
shared_genes.list("symbol")[:10]
['HES4',
'TNFRSF4',
'SSU72',
'PARK7',
'RBP7',
'SRM',
'MAD2L2',
'AGTRAP',
'TNFRSF1B',
'EFHD2']
Compare cell types#
file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names
['CD16-positive, CD56-dim natural killer cell, human']
Load the individual files#
We could either load the files into memory or access them in backed mode through `.backed()`, which lazily loads their content from the cloud or from disk.
Let’s load them into memory:
adata1 = file1.load()
adata2 = file2.load()
We can now subset the two datasets by shared cell types:
adata1_subset = adata1[adata1.obs["cell_type"].isin(shared_celltypes_names)]
adata2_subset = adata2[adata2.obs["cell_type"].isin(shared_celltypes_names)]