统一脑区命名
最近在做全脑跨物种比较，涉及多个物种的脑区信息，在数据清洗过程中转换名称的步骤比较复杂，所以就整理了一个跨物种脑区统一转换表格。

```r
# Unify cell-type labels across brain regions, then derive a three-level
# hierarchy (Level_1 / Level_2 / Level_3) from the unified label.
#
# Input : `all_metadata_list`, a named list of per-region metadata tables
#         (defined elsewhere; names are expected to be the region keys below).
# Output: each table gains the columns Unified_CellType, Level_3, Level_2
#         and Level_1; the list is updated in place.
library(dplyr)

# 0. Region name -> metadata column that holds the original annotation.
col_mapping <- c(
  "A1"           = "Final_Layer_Annotation",
  "Amygdala"     = "celltype.V2",
  "Clastrum"     = "subtype.V2",
  "Hippocampus"  = "celltype.sub",
  "Hypothalamus" = "celltype.V1",
  "Striatum"     = "celltype.V2",
  "V1"           = "Final_Layer_Annotation"
)

# Stage-1 dictionary: exact matches for long descriptive labels and special
# cases. Loop-invariant, so it is built once rather than once per region.
mapping_dict <- c(
  "Upper-layer intratelencephalic"    = "GLUT_UL_IT",
  "Deep-layer intratelencephalic"     = "GLUT_DL_IT",
  "Deep-layer corticothalamic and 6b" = "GLUT_DL_CT6b",
  "Deep-layer near-projecting"        = "GLUT_DL_NP",
  "Amygdala excitatory"               = "GLUT_AMY",
  "Mammillary body"                   = "GLUT_MB",
  "Midbrain-derived inhibitory"       = "GABA_MDI",
  "Cholinergic_GABA"                  = "GABA_Cho",
  "Endothelial Vascular"              = "Vasc",
  "Choroid plexus"                    = "ChP",
  "Lower rhombic lip"                 = "LRL",
  "Dopaminergic Neuron"               = "DA"
)

for (region in names(all_metadata_list)) {

  # A region without a column mapping would make `[[` fail with
  # "subscript out of bounds"; skip it explicitly instead.
  if (!region %in% names(col_mapping)) {
    warning("No source column configured for region '", region, "'; skipped.")
    next
  }
  orig_col <- col_mapping[[region]]

  raw_labels <- all_metadata_list[[region]][[orig_col]]
  if (is.null(raw_labels)) {
    # A missing column would yield a zero-length vector and break the
    # assignment at the end of the loop body.
    warning("Column '", orig_col, "' not found for region '", region, "'; skipped.")
    next
  }

  # Force character so that factor integer codes never leak into the labels.
  cleaned_types <- as.character(raw_labels)

  # ------------------------------------------------------------------
  # Stage 1: exact matches for long descriptions and special cases
  # ------------------------------------------------------------------
  # One vectorised lookup; labels without a dictionary entry (and NA) are
  # left untouched.
  hit <- match(cleaned_types, names(mapping_dict))
  has_hit <- !is.na(hit)
  cleaned_types[has_hit] <- unname(mapping_dict[hit[has_hit]])

  # ------------------------------------------------------------------
  # Stage 2: non-neuronal abbreviations
  # ------------------------------------------------------------------
  cleaned_types <- gsub("Astrocyte", "Astro", cleaned_types)
  cleaned_types <- gsub("Oligodendrocyte", "Oligo", cleaned_types)
  cleaned_types <- gsub("Microglia", "Micro", cleaned_types)
  cleaned_types <- gsub("Vascular", "Vasc", cleaned_types)
  cleaned_types <- gsub("Ependymal", "Epen", cleaned_types)

  # ------------------------------------------------------------------
  # Stage 3: striatal MSN family
  # ------------------------------------------------------------------
  # The eccentric variant must be handled first, otherwise the generic
  # pattern would consume its "medium spiny neuron" suffix.
  cleaned_types <- gsub("(?i)Eccentric medium spiny neuron", "eMSN", cleaned_types, perl = TRUE)
  cleaned_types <- gsub("(?i)Medium spiny neuron", "MSN", cleaned_types, perl = TRUE)

  # ------------------------------------------------------------------
  # Stage 4: inhibitory (GABAergic) neurons
  # ------------------------------------------------------------------
  cleaned_types <- gsub("^Gaba_", "GABA_", cleaned_types)
  cleaned_types <- gsub("^(CGE|MGE|LGE)_", "GABA_", cleaned_types)
  cleaned_types <- gsub("(CGE|MGE) interneuron", "GABA_\\1", cleaned_types)
  cleaned_types <- gsub("(PVALB|VIP|SST|LAMP5|RELN) neurons", "GABA_\\1", cleaned_types)
  cleaned_types <- gsub("GABA_Vip", "GABA_VIP", cleaned_types)

  # Chandelier cells.
  cleaned_types <- gsub("PVALB Chandelier neurons|GABA_PVALB Chandelier", "GABA_PV_Cha", cleaned_types)
  cleaned_types <- gsub("LAMP5-LHX6 and Chandelier", "GABA_LAMP5-LHX6_Cha", cleaned_types)

  # Abbreviate every remaining PVALB to PV (also covers GABA_PVALB created above).
  cleaned_types <- gsub("PVALB", "PV", cleaned_types)

  # ------------------------------------------------------------------
  # Stage 5: excitatory (glutamatergic) neurons and hippocampus
  # ------------------------------------------------------------------
  # Normalise the prefix to GLUT_ (covers Glut, Glut_, Glu_).
  cleaned_types <- gsub("Glut_?", "GLUT_", cleaned_types, ignore.case = TRUE)
  cleaned_types <- gsub("Glu_", "GLUT_", cleaned_types, ignore.case = TRUE)  # catches Glu_SUB
  # NOTE(review): the quantifier was lost in the extracted source; "GLUT_+"
  # (collapse repeated underscores) is the assumed original - confirm.
  cleaned_types <- gsub("GLUT_+", "GLUT_", cleaned_types)

  # Hippocampus-specific conversions.
  cleaned_types <- gsub("Hippocampal dentate gyrus", "GLUT_DG", cleaned_types)
  cleaned_types <- gsub("Hippocampal ", "GLUT_", cleaned_types)
  cleaned_types <- gsub("GLUT_CA1-3", "GLUT_CA", cleaned_types)  # simplify CA1-3

  # Cortical layer labels (L1..L6) are treated as excitatory: add the prefix.
  is_layer_neuron <- grepl("^L[1-6]", cleaned_types)
  if (any(is_layer_neuron)) {
    cleaned_types[is_layer_neuron] <- paste0("GLUT_", cleaned_types[is_layer_neuron])
  }
  # Compress layer ranges: L4/5, L3-L6 -> L45, L36.
  cleaned_types <- gsub("GLUT_L([0-6])[-/]L?([0-6])", "GLUT_L\\1\\2", cleaned_types)
  # Drop the separator between the layer spec and what follows it.
  # NOTE(review): the "+" after the character class was lost in the extracted
  # source and is assumed here; with it "L4/5 IT" becomes GLUT_L45IT. The
  # naming tables below show both GLUT_L45_IT and GLUT_L34IT - confirm which
  # form is intended.
  cleaned_types <- gsub("GLUT_L([0-9ab]+)[ -]", "GLUT_L\\1", cleaned_types)

  # ------------------------------------------------------------------
  # Stage 6: final safety check and symbol cleanup
  # ------------------------------------------------------------------
  cleaned_types <- gsub("GABA_GABA", "GABA_Unknown", cleaned_types)

  # Remove symbols that are awkward for downstream tools.
  cleaned_types <- gsub("-", "_", cleaned_types)    # every hyphen becomes an underscore
  cleaned_types <- gsub("\\+", "", cleaned_types)   # every plus sign is removed (e.g. AVP+)

  # Store the cleaned labels in the Unified_CellType column.
  all_metadata_list[[region]]$Unified_CellType <- cleaned_types
}

cat("✅ 数据已按最新规范统一清洗\n")

for (region in names(all_metadata_list)) {

  meta <- all_metadata_list[[region]]

  # Regions skipped by the cleaning loop have no unified label to classify.
  if (!"Unified_CellType" %in% colnames(meta)) {
    warning("Region '", region, "' has no Unified_CellType column; skipped.")
    next
  }

  meta <- meta %>%
    mutate(
      # ----------------------------------------------------------------
      # Level 3: finest grouping, the basis for the upper levels.
      # case_when() takes the first matching branch, so later branches do
      # not need to re-exclude what earlier ones already captured.
      # ----------------------------------------------------------------
      Level_3 = case_when(
        # 1. Glia and other non-neurons (kept as is, or merged).
        Unified_CellType %in% c("Astro", "Oligo", "Micro", "OPC", "Vasc") ~ Unified_CellType,
        Unified_CellType %in% c("Epen", "ChP") ~ "Epen_ChP",
        Unified_CellType == "LRL" ~ "Progenitor_LRL",  # lower rhombic lip progenitors

        # 2. GABA: split by developmental origin.
        # MGE origin: PV, SST, LHX6, SOX6 and chandelier cells.
        grepl("PV|SST|LHX6|SOX6|MGE", Unified_CellType) &
          grepl("GABA", Unified_CellType) ~ "GABA_MGE",
        # CGE origin: VIP, LAMP5, RELN, CCK.
        grepl("VIP|LAMP5|RELN|CCK|CGE", Unified_CellType) &
          grepl("GABA", Unified_CellType) ~ "GABA_CGE",
        # LGE origin (striatal projection neurons): MSN family.
        grepl("MSN", Unified_CellType) ~ "GABA_LGE_MSN",
        # Remaining GABA labels (e.g. hypothalamus-specific inhibitory cells).
        grepl("GABA", Unified_CellType) ~ "GABA_Other_Diencephalon",

        # 3. GLUT: cortex by projection class, elsewhere by region.
        # NOTE(review): the CT / ET / NP patterns are unanchored, so a gene-style
        # label containing those letters (e.g. GLUT_NP...) would also match;
        # tighten once the full label inventory is known.
        grepl("GLUT_.*IT$", Unified_CellType) ~ "Cortical_IT",  # intratelencephalic
        grepl("GLUT_.*CT", Unified_CellType) ~ "Cortical_CT",   # corticothalamic
        grepl("GLUT_.*ET", Unified_CellType) ~ "Cortical_ET",   # extratelencephalic
        grepl("GLUT_.*NP", Unified_CellType) ~ "Cortical_NP",   # near-projecting
        grepl("L6b|CAR3", Unified_CellType) ~ "Cortical_L6b",   # special deep-layer groups
        # Hippocampus: CA / DG / SUB must be a complete token (optionally followed
        # by a digit or underscore), so labels such as GLUT_CARTPT are not
        # mistaken for hippocampal CA fields.
        grepl("^GLUT_(CA|DG|SUB)([0-9_]|$)", Unified_CellType) ~ "Hippocampal_GLUT",
        # Every other GLUT label: subcortical / hypothalamic / amygdalar types.
        grepl("GLUT", Unified_CellType) ~ "Subcortical_GLUT",

        # 4. Other special neurons.
        Unified_CellType == "DA" ~ "DA_Neuron",
        Unified_CellType == "HDC neuron" ~ "Histaminergic_Neuron",

        # Fallback.
        TRUE ~ "Unknown"
      ),

      # ----------------------------------------------------------------
      # Level 2: lineage class, aggregated from Level 3.
      # ----------------------------------------------------------------
      Level_2 = case_when(
        grepl("GABA", Level_3) ~ "GABA",
        grepl("Cortical|Hippocampal|Subcortical|GLUT", Level_3) ~ "GLUT",
        Level_3 %in% c("DA_Neuron", "Histaminergic_Neuron") ~ "Other_Neuron",
        Level_3 == "Progenitor_LRL" ~ "Progenitor",
        TRUE ~ Level_3  # glia (Astro, Oligo, ...) stay unchanged at Level 2
      ),

      # ----------------------------------------------------------------
      # Level 1: neuron vs non-neuron.
      # ----------------------------------------------------------------
      Level_1 = case_when(
        Level_2 %in% c("GABA", "GLUT", "Other_Neuron") ~ "Neuron",
        Level_2 %in% c("Astro", "Oligo", "Micro", "OPC", "Vasc", "Epen_ChP") ~ "Non_Neuron",
        TRUE ~ "Other"  # leaves room for early developmental progenitors
      )
    )

  # Write the annotated table back into the list.
  all_metadata_list[[region]] <- meta
}

cat("✅ 成功完成 Level 1, 2, 3 的多层级注释\n")
```

细胞类型命名标准化说明 (Cell Type Nomenclature Standardization)

为了在全脑/多脑区尺度上进行无缝的数据整合，同时确保在不同编程环境（如 R 和 Python）中的语法兼容性，我们对各个原始数据集（涵盖 A1, Amygdala, Clastrum, Hippocampus, Hypothalamus, Striatum, V1 等区域）的细胞亚群进行了系统性的重命名与符号清洗。主要标准化原则如下：

1. 系统兼容性符号清洗 (Syntax-Safe Formatting)

为了避免特殊字符在下游分析算法或对象读取中引发报错，执行了严格的符号过滤：

- 移除加号：移除了所有细胞类型名称中的 `+` 符号，例如 `GLUT_AVP+` / `GLUT_AVP++` 被统一截断为 `GLUT_AVP`。
- 下划线替代连字符：将所有的连字符 `-` 和多余空格全局替换为下划线 `_`，确保名称格式的连贯性，例如原先转化生成的 `GLUT_L45-IT` 最终变为 `GLUT_L45_IT`。

2. 非神经元细胞简称映射 (Non-neurons Abbreviation)

对所有非神经元细胞使用了规范化的高级缩写，以优化降维图谱的图例展示效果。

| 原命名 (Original Name) | 标准化简称 (Standardized Name) |
| --- | --- |
| Astrocyte | Astro |
| Oligodendrocyte | Oligo |
| Microglia | Micro |
| Vascular / Endothelial Vascular | Vasc |
| Ependymal | Epen |

3. 描述性长文本与特殊亚类映射 (Descriptive Nomenclature Mapping)

针对过于口语化的长文本命名及特定高频出现的细胞亚类，提取其核心解剖与投射属性，并赋予机器可读的标准缩写。

| 原命名 (Original Name) | 标准化简称 (Standardized Name) |
| --- | --- |
| Upper-layer intratelencephalic | GLUT_UL_IT |
| Deep-layer intratelencephalic | GLUT_DL_IT |
| Deep-layer corticothalamic and 6b | GLUT_DL_CT6b |
| Deep-layer near-projecting | GLUT_DL_NP |
| Amygdala excitatory | GLUT_AMY |
| Mammillary body | GLUT_MB |
| Midbrain-derived inhibitory | GABA_MDI |
| Cholinergic_GABA | GABA_Cho |
| PVALB (全称或包含 PVALB 的命名) | PV (例如 GABA_PV) |
| PVALB Chandelier neurons | GABA_PV_Cha |
| LAMP5-LHX6 and Chandelier | GABA_LAMP5_LHX6_Cha |

4. 
特征神经元与区域特异性整合 (Specific Neuronal Subtypes)

统一了纹状体（Striatum）和海马体（Hippocampus）中具有高度特异性的细胞命名，特别是将海马体的兴奋性群体全部归入 `GLUT_` 大类。

| 原命名 (Original Name) | 标准化简称 (Standardized Name) |
| --- | --- |
| Medium spiny neuron | MSN |
| Eccentric medium spiny neuron | eMSN |
| Glu_SUB | GLUT_SUB |
| Hippocampal dentate gyrus | GLUT_DG |
| Hippocampal CA1-3 | GLUT_CA |
| Hippocampal CA4 | GLUT_CA4 |

5. 神经元大类前缀与皮层属性压缩 (Lineage Prefixing & Layer Formatting)

- 强制前缀统一：所有神经元均使用标准大写递质前缀 `GLUT_` 或 `GABA_`。原始数据中的发育起源（`CGE_`, `MGE_`）及各种变体拼写（`Glut`, `Glu_`, `Gaba_`）均被强制归一化。
- 皮层属性紧凑化：针对皮层兴奋性神经元，提取了层级范围与投射属性，整合为单一下划线连接的标准结构。
  - `L4/5 IT` ➡️ `GLUT_L45_IT`
  - `L3-L6 IT` ➡️ `GLUT_L36_IT`
  - `L6 CT` ➡️ `GLUT_L6_CT`

层级命名对应表

| Level 1 (大类) | Level 2 (谱系) | Level 3 (发育起源/投射属性/解剖亚区) | 包含的原始细胞类型 (Unified_CellType) |
| --- | --- | --- | --- |
| Non-Neuron | Astro | Astro | Astro |
| | Oligo | Oligo | Oligo |
| | Micro | Micro | Micro |
| | OPC | OPC | OPC |
| | Vasc | Vasc | Vasc |
| | Epen/ChP | Epen/ChP | Epen, ChP |
| Neuron | GABA | GABA_MGE (内侧起源) | GABA_PV, GABA_SST, GABA_SOX6, GABA_LHX6, GABA_PV_Cha 等 |
| | | GABA_CGE (尾侧起源) | GABA_VIP, GABA_LAMP5, GABA_RELN, GABA_CCK 等 |
| | | GABA_LGE / MSN (外侧/纹状体) | MSN, eMSN, MSN_DRD1, MSN_DRD2 |
| | | GABA_Diencephalon (间脑局部) | 下丘脑的大量特征 GABA (GABA_AGRP, GABA_POMC 等) |
| | GLUT | Cortical_IT (端脑内) | GLUT_L.*IT (L2IT, L34IT, UL_IT, DL_IT 等) |
| | | Cortical_CT (丘脑投射) | GLUT_L6CT, GLUT_DL_CT6b |
| | | Cortical_ET (端脑外) | GLUT_L5ET |
| | | Cortical_NP (近距离) | GLUT_L56NP, GLUT_DL_NP |
| | | Hippocampal (海马) | GLUT_CA, GLUT_DG, GLUT_CA4, GLUT_SUB |
| | | Subcortical_GLUT (皮层下) | GLUT_AMY, 下丘脑各类 GLUT (GLUT_OXT, AVP 等) |
| | DA/Other | DA_Neuron | DA (多巴胺), HDC neuron (组胺) |