import gzip
import tarfile
from pathlib import Path
import anndata as ad
import pandas as pd
import requests
import scipy.io as sio
from scipy import sparse
dataDir = Path("data")
dataDir.mkdir(exist_ok=True)
GEOAcc = "GSE125708"
# Download the data from GEO
url = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={GEOAcc}&format=file"
if not (dataDir / f"{GEOAcc}.tar").exists():
print("Downloading data...")
r = requests.get(url, allow_redirects=True)
(dataDir / f"{GEOAcc}.tar").write_bytes(r.content)
Downloading data...
# Extract the tar file
with tarfile.open(dataDir / f"{GEOAcc}.tar") as tar:
tar.extractall(dataDir)
# gunzip the .gz files in dataDir
for f in dataDir.glob("*.gz"):
with gzip.open(f, "rb") as gz:
(dataDir / f.stem).write_bytes(gz.read())
f.unlink()
# Loop through all the '*.txt' files in dataDir and create a list of annData objects from their contentssamples = {}
samples = {}
for file in dataDir.glob("*.txt"):
print("Reading in " + file.stem)
names = file.stem.split("_")
# Format data such that gene expression is in columns
df = pd.read_csv(file, delimiter=" ", header=[0]).T
df.rename(columns=df.iloc[0], inplace=True)
df.drop(df.index[0], inplace=True)
df.index = pd.Index([names[0]], dtype="object")
df = df.astype("float32")
adata = ad.AnnData(
X=sparse.csr_matrix(df.values),
obs=pd.DataFrame(index=df.index),
var=pd.DataFrame(index=df.columns),
)
adata.obs["sample"] = names[1]
if names[1].startswith("A"):
adata.obs["tissue"] = "retina"
elif names[1].startswith("J"):
adata.obs["tissue"] = "whole_brain"
else:
raise ValueError("Unknown tissue type")
# get basename of the file after removing .txt
samples[file.stem] = adata
Reading in GSM3580729_AR39_MC_WT_R2_transcript_abundances Reading in GSM3580739_JH043_whole_brain_48h_hypoxia_R1_transcript_abundances Reading in GSM3580740_JH044_whole_brain_48h_hypoxia_R2_transcript_abundances Reading in GSM3580734_AR83_retina_Ndp_R1_transcript_abundances Reading in GSM3580742_JH046_whole_brain_7d_hypoxia_R1_transcript_abundances Reading in GSM3580736_JH040_whole_brain_normoxia_R1_transcript_abundances Reading in GSM3580738_JH042_whole_brain_normoxia_R3_transcript_abundances Reading in GSM3580744_JH048_whole_brain_7d_hypoxia_R3_transcript_abundances Reading in GSM3580731_AR53_retina_WT_R2_transcript_abundances Reading in GSM3580743_JH047_whole_brain_7d_hypoxia_R2_transcript_abundances Reading in GSM3580737_JH041_whole_brain_normoxia_R2_transcript_abundances Reading in GSM3580733_AR60_MC_Ndp_R2_transcript_abundances Reading in GSM3580735_AR84_retina_Ndp_R2_transcript_abundances Reading in GSM3580732_AR58_MC_Ndp_R1_transcript_abundances Reading in GSM3580741_JH045_whole_brain_48h_hypoxia_R3_transcript_abundances Reading in GSM3580730_AR49_retina_WT_R1_transcript_abundances Reading in GSM3580728_AR18_MC_WT_R1_transcript_abundances
df = ad.concat(samples.values(), merge="same")
# Write the expression matrix to a matrix market file
sio.mmwrite(dataDir / "GSE129788.mtx", df.X)
# Write the obs to a csv file
df.obs.to_csv(dataDir / "GSE129788_cell_info.csv")
# Write the var to a csv file
df.var.to_csv(dataDir / "GSE129788_gene_info.csv")