{
  "generated_at_utc": "2026-02-13T20:16:56Z",
  "source_scan_status": "complete",
  "data_sources_root": "C:\\Users\\jsw82\\Documents\\EV_dex\\data_sources",
  "sample_databases_root": "C:\\Users\\jsw82\\Documents\\GitHub\\EV_dex\\sample_databases",
  "source_families": [
    "CellMarker",
    "CellPhoneDB",
    "CellTalkDB",
    "Cellinker",
    "DisGeNET",
    "EV-Track",
    "Ensembl BioMart",
    "ExoCarta",
    "Gene Ontology",
    "Human Protein Atlas",
    "KEGG",
    "OmniPath Intercell",
    "PanglaoDB",
    "RNALocate",
    "RNAcentral",
    "Reactome",
    "Reactome LRdb",
    "STRING",
    "SVAtlas",
    "TarBase",
    "TargetScan",
    "UniProt",
    "Vesiclepedia",
    "WikiPathways",
    "miRBase",
    "miRNA-atlas",
    "miRTarBase",
    "mirMine"
  ],
  "datasets": [
    {
      "dataset_id": "cell_specificity_unified",
      "parquet_file": "cell_specificity_unified.parquet",
      "row_count": 1587444,
      "columns": [
        "Gene_ID",
        "Gene_Symbol",
        "Cell_Type",
        "System",
        "Expression_Value",
        "Expression_Level",
        "Source",
        "Is_Marker"
      ],
      "upstream_sources": [
        "CellMarker",
        "Human Protein Atlas",
        "PanglaoDB"
      ],
      "raw_input_files": [
        "08_cell_specificity/Cell_marker_Human.xlsx",
        "08_cell_specificity/PanglaoDB_Human.tsv",
        "08_cell_specificity/cell_to_system_map.tsv",
        "08_cell_specificity/rna_single_cell_type.tsv"
      ],
      "processing_scripts": [
        "08_cell_specificity/process_cell_specificity.py"
      ],
      "key_ids": [
        "Gene_ID",
        "Gene_Symbol"
      ],
      "where_used": [
        "/api/cell_specificity",
        "/api/cell_communication_v2"
      ],
      "update_notes": "Cell-level expression/marker support aggregated to system-level labels."
    },
    {
      "dataset_id": "ev_evidence",
      "parquet_file": "ev_evidence.parquet",
      "row_count": 234090,
      "columns": [
        "Molecule_Standard_ID",
        "Molecule_Type",
        "PubMed_ID",
        "Source_Database",
        "Sample_Type",
        "Isolation_Method",
        "Detection_Method",
        "Original_Evidence_ID"
      ],
      "upstream_sources": [
        "EV-Track",
        "ExoCarta",
        "SVAtlas",
        "Vesiclepedia"
      ],
      "raw_input_files": [
        "05_EV_databases/evtrack_data.xlsx",
        "05_EV_databases/exocarta_mirnas.txt",
        "05_EV_databases/exocarta_mrna_protein.txt",
        "05_EV_databases/svatlas_data/*/project_marker.csv",
        "05_EV_databases/vesiclepedia_mirnas.txt",
        "05_EV_databases/vesiclepedia_proteins_mrnas.txt"
      ],
      "processing_scripts": [
        "05_EV_databases/process_exocarta.py",
        "05_EV_databases/process_svatlas.py",
        "05_EV_databases/process_vesiclepedia.py",
        "05_EV_databases/unify_evidence.py"
      ],
      "key_ids": [
        "PubMed_ID"
      ],
      "where_used": [
        "/api/details",
        "EV Evidence tab",
        "Landing metrics"
      ],
      "update_notes": "Molecule-level EV detection evidence with PubMed and protocol fields."
    },
    {
      "dataset_id": "gene_annotations",
      "parquet_file": "gene_annotations.parquet",
      "row_count": 19424,
      "columns": [
        "gene_symbol",
        "is_secreted",
        "is_membrane"
      ],
      "upstream_sources": [
        "UniProt"
      ],
      "raw_input_files": [
        "10_protein_secreated_membrane/membrane_genes.tsv",
        "10_protein_secreated_membrane/secreted_genes.tsv"
      ],
      "processing_scripts": [
        "10_protein_secreated_membrane/create_annotations.py"
      ],
      "key_ids": [],
      "where_used": [
        "/api/details",
        "Summary EV tags"
      ],
      "update_notes": "Binary secreted/membrane annotations for protein-centric interpretation."
    },
    {
      "dataset_id": "gene_disease_associations",
      "parquet_file": "gene_disease_associations.parquet",
      "row_count": 501157,
      "columns": [
        "Gene_ID",
        "Gene_Symbol",
        "Disease_ID",
        "Disease_Name",
        "Score",
        "Association_Type",
        "Evidence_Level",
        "Source",
        "Publication_Year",
        "Publication_ID",
        "Publication_URL",
        "MeSH_Disease_Classes",
        "Disease_Type",
        "Gene_DSI",
        "Gene_DPI",
        "Supporting_Sentence"
      ],
      "upstream_sources": [
        "DisGeNET"
      ],
      "raw_input_files": [
        "09_gene_diseases/gene_disease_associations.csv",
        "09_gene_diseases/genes.csv"
      ],
      "processing_scripts": [
        "09_gene_diseases/process_disgenet.py"
      ],
      "key_ids": [
        "Disease_ID",
        "Gene_ID",
        "Gene_Symbol"
      ],
      "where_used": [
        "/api/collective_disease_analysis",
        "/api/details"
      ],
      "update_notes": "Gene-disease evidence with Disease_ID grouping and publication metadata."
    },
    {
      "dataset_id": "gene_expression",
      "parquet_file": "gene_expression.parquet",
      "row_count": 2479776,
      "columns": [
        "Gene_ID",
        "Localization_System",
        "Localization_Specific",
        "Source_Database",
        "Evidence_Type",
        "Normalized_Score",
        "Protein_Score_Normalized",
        "Protein_Level",
        "RNA_TPM",
        "RNALocate_Score",
        "Tissue_Raw"
      ],
      "upstream_sources": [
        "Human Protein Atlas",
        "RNALocate"
      ],
      "raw_input_files": [
        "02_hpa_expression/normal_tissue.tsv",
        "02_hpa_expression/rna_consensus.tsv",
        "04_localization_mapping/localization_map.csv",
        "04_localization_mapping/rnalocate_experimental_mRNA.txt",
        "04_localization_mapping/rnalocate_predicted_mRNA.txt"
      ],
      "processing_scripts": [
        "02_hpa_expression/process_gene_expression.py"
      ],
      "key_ids": [
        "Gene_ID"
      ],
      "where_used": [
        "/api/details",
        "Collective Expression"
      ],
      "update_notes": "Gene tissue/localization expression with source-specific evidence labels."
    },
    {
      "dataset_id": "gene_pathways",
      "parquet_file": "gene_pathways.parquet",
      "row_count": 404758,
      "columns": [
        "Gene_ID",
        "Pathway_Name",
        "Pathway_Category",
        "Pathway_Source",
        "Pathway_Description"
      ],
      "upstream_sources": [
        "Gene Ontology",
        "KEGG",
        "Reactome",
        "WikiPathways"
      ],
      "raw_input_files": [
        "06_pathway_enrichment/ensembl_to_go.txt",
        "06_pathway_enrichment/ensembl_to_ncbi.txt",
        "06_pathway_enrichment/ensembl_to_reactome.txt",
        "06_pathway_enrichment/wikipathways_homo_sapiens.txt"
      ],
      "processing_scripts": [
        "06_pathway_enrichment/process_pathways.py"
      ],
      "key_ids": [
        "Gene_ID",
        "Pathway_Name"
      ],
      "where_used": [
        "/api/pathway_enrichment",
        "Pathway Analysis tab"
      ],
      "update_notes": "Gene-pathway memberships used for hypergeometric enrichment and category views."
    },
    {
      "dataset_id": "genes",
      "parquet_file": "genes.parquet",
      "row_count": 21763,
      "columns": [
        "Gene_ID",
        "Gene_Symbol",
        "Full_Name"
      ],
      "upstream_sources": [
        "Ensembl BioMart",
        "UniProt"
      ],
      "raw_input_files": [
        "01_lookup_tables/ensembl_master_list.csv",
        "01_lookup_tables/uniprot_human_data.tsv"
      ],
      "processing_scripts": [
        "01_lookup_tables/process_master_list.py",
        "01_lookup_tables/process_uniprot.py"
      ],
      "key_ids": [
        "Gene_ID",
        "Gene_Symbol"
      ],
      "where_used": [
        "/api/pathway",
        "/api/details",
        "Identifier linking"
      ],
      "update_notes": "Canonical gene dictionary keyed by Gene_ID."
    },
    {
      "dataset_id": "ligand_receptor_pairs_full",
      "parquet_file": "ligand_receptor_pairs_full.parquet",
      "row_count": 25779,
      "columns": [
        "ligand_gene_symbol",
        "receptor_gene_symbol",
        "is_directed",
        "is_stimulation",
        "is_inhibition",
        "sources",
        "references"
      ],
      "upstream_sources": [
        "CellPhoneDB",
        "CellTalkDB",
        "Cellinker",
        "OmniPath Intercell",
        "Reactome LRdb"
      ],
      "raw_input_files": [
        "07_cell_communication/complex_input.csv",
        "07_cell_communication/interaction_input.csv",
        "07_cell_communication/protein_input.csv"
      ],
      "processing_scripts": [
        "07_cell_communication/download_lr_pairs.py",
        "07_cell_communication/download_omnipath_full.py",
        "07_cell_communication/process_cellphone_db.py"
      ],
      "key_ids": [],
      "where_used": [
        "/api/cell_communication_v2",
        "/api/ligand_receptor_analysis"
      ],
      "update_notes": "Directed ligand-receptor interactions with multi-source provenance strings."
    },
    {
      "dataset_id": "miRNA_expression",
      "parquet_file": "miRNA_expression.parquet",
      "row_count": 291427,
      "columns": [
        "miRBase_ID",
        "Localization_System",
        "Localization_Specific",
        "Expression_Value",
        "Expression_Level",
        "Normalized_Score_0_1",
        "Source_Database",
        "Evidence_Type"
      ],
      "upstream_sources": [
        "RNALocate",
        "miRNA-atlas",
        "mirMine"
      ],
      "raw_input_files": [
        "04_localization_mapping/localization_map.csv",
        "04_localization_mapping/mirmine.xlsx",
        "04_localization_mapping/mirna_atlas.csv",
        "04_localization_mapping/rnalocate_experimental.txt",
        "04_localization_mapping/rnalocate_predicted.txt"
      ],
      "processing_scripts": [
        "04_localization_mapping/process_mirna_expression.py"
      ],
      "key_ids": [
        "miRBase_ID"
      ],
      "where_used": [
        "/api/details",
        "Collective Expression"
      ],
      "update_notes": "miRNA localization and expression standardized to miRBase_ID."
    },
    {
      "dataset_id": "miRNA_targets_scored",
      "parquet_file": "miRNA_targets_scored.parquet",
      "row_count": 2639746,
      "columns": [
        "miRNA_ID",
        "Gene_ID",
        "Confidence_Score",
        "Source_Database"
      ],
      "upstream_sources": [
        "TarBase",
        "TargetScan",
        "miRTarBase"
      ],
      "raw_input_files": [
        "03_miRNA_targets/mirtarbase_human.csv",
        "03_miRNA_targets/tarbase_human.tsv",
        "03_miRNA_targets/targetscan_data.txt",
        "other/discarded_interactions_log.csv"
      ],
      "processing_scripts": [
        "03_miRNA_targets/process_advanced_targets.py",
        "03_miRNA_targets/unify_targets.py"
      ],
      "key_ids": [
        "Gene_ID",
        "miRNA_ID"
      ],
      "where_used": [
        "/api/pathway",
        "/api/details",
        "Pathway inference",
        "Network expansion"
      ],
      "update_notes": "Confidence_Score combines evidence tiers from experimental and predicted sources."
    },
    {
      "dataset_id": "miRNAs",
      "parquet_file": "miRNAs.parquet",
      "row_count": 2656,
      "columns": [
        "miRBase_ID",
        "miRNA_Name"
      ],
      "upstream_sources": [
        "miRBase"
      ],
      "raw_input_files": [
        "01_lookup_tables/miRNA.xlsx",
        "01_lookup_tables/miRNAs.csv"
      ],
      "processing_scripts": [
        "01_lookup_tables/process_mirbase.py"
      ],
      "key_ids": [
        "miRBase_ID"
      ],
      "where_used": [
        "/api/pathway",
        "/api/details",
        "miRNA annotation"
      ],
      "update_notes": "Canonical mature miRNA dictionary keyed by miRBase_ID."
    },
    {
      "dataset_id": "mirna_annotation_cache",
      "parquet_file": "mirna_annotation_cache.parquet",
      "row_count": 6,
      "columns": [
        "miRBase_ID",
        "rnacentral_id",
        "sequence",
        "sequence_length",
        "fetched_at",
        "source",
        "status",
        "error"
      ],
      "upstream_sources": [
        "RNAcentral"
      ],
      "raw_input_files": [
        "runtime API fetch cache"
      ],
      "processing_scripts": [
        "app.py RNAcentral annotation helpers"
      ],
      "key_ids": [
        "miRBase_ID"
      ],
      "where_used": [
        "/api/details",
        "miRNA summary annotation"
      ],
      "update_notes": "On-demand sequence cache with status, source, and fetch timestamp."
    },
    {
      "dataset_id": "mirna_disease_associations",
      "parquet_file": "mirna_disease_associations.parquet",
      "row_count": 145,
      "columns": [
        "miRNA_Name",
        "miRBase_ID",
        "Gene_Symbol",
        "Disease_ID",
        "Disease_Name",
        "Score",
        "Association_Type",
        "Evidence_Level",
        "Source",
        "Publication_Year",
        "Publication_ID",
        "Publication_URL",
        "MeSH_Disease_Classes",
        "Disease_Type",
        "Gene_DSI",
        "Gene_DPI",
        "Supporting_Sentence"
      ],
      "upstream_sources": [
        "DisGeNET"
      ],
      "raw_input_files": [
        "09_gene_diseases/miRNAs.csv",
        "09_gene_diseases/mirna_direct_disease_evidence.csv",
        "09_gene_diseases/mirna_disease_associations.csv"
      ],
      "processing_scripts": [
        "09_gene_diseases/process_disgenet_miRNA.py"
      ],
      "key_ids": [
        "Disease_ID",
        "Gene_Symbol",
        "miRBase_ID"
      ],
      "where_used": [
        "/api/collective_disease_analysis",
        "/api/details"
      ],
      "update_notes": "Direct miRNA disease evidence mapped through gene mediation fields."
    },
    {
      "dataset_id": "molecule_summary_data",
      "parquet_file": "molecule_summary_data.parquet",
      "row_count": 43762,
      "columns": [
        "id",
        "name",
        "description",
        "type",
        "ensp_id"
      ],
      "upstream_sources": [
        "STRING",
        "UniProt",
        "miRBase"
      ],
      "raw_input_files": [
        "01_lookup_tables/miRNAs.csv",
        "01_lookup_tables/string_annotations.txt",
        "01_lookup_tables/uniprot_human_data.tsv"
      ],
      "processing_scripts": [
        "01_lookup_tables/Summary_process.py"
      ],
      "key_ids": [],
      "where_used": [
        "/api/details",
        "Node Summary"
      ],
      "update_notes": "Unified short descriptions for miRNA, mRNA, and protein nodes."
    },
    {
      "dataset_id": "proteins",
      "parquet_file": "proteins.parquet",
      "row_count": 19343,
      "columns": [
        "Protein_ID",
        "Protein_Name",
        "Corresponding_Gene_ID"
      ],
      "upstream_sources": [
        "Ensembl BioMart",
        "UniProt"
      ],
      "raw_input_files": [
        "01_lookup_tables/ensembl_master_list.csv",
        "01_lookup_tables/uniprot_human_data.tsv"
      ],
      "processing_scripts": [
        "01_lookup_tables/process_master_list.py",
        "01_lookup_tables/process_uniprot.py"
      ],
      "key_ids": [
        "Corresponding_Gene_ID",
        "Protein_ID"
      ],
      "where_used": [
        "/api/pathway",
        "/api/details",
        "Protein mapping"
      ],
      "update_notes": "Protein_ID linked to Corresponding_Gene_ID for network expansion."
    },
    {
      "dataset_id": "publication_details",
      "parquet_file": "publication_details.parquet",
      "row_count": 3350,
      "columns": [
        "PubMed_ID",
        "EV_Track_ID",
        "EV_Track_Score_Percent",
        "Isolation_Protocols",
        "Sample_Type",
        "EV_Cell_Name",
        "First_Author",
        "Year",
        "Title"
      ],
      "upstream_sources": [
        "EV-Track"
      ],
      "raw_input_files": [
        "05_EV_databases/evtrack_data.xlsx"
      ],
      "processing_scripts": [
        "05_EV_databases/process_evtrack.py"
      ],
      "key_ids": [
        "PubMed_ID"
      ],
      "where_used": [
        "/api/details",
        "EV Evidence publication details"
      ],
      "update_notes": "Publication-level EV-Track metadata joined by PubMed_ID."
    },
    {
      "dataset_id": "string_interactions",
      "parquet_file": "string_interactions.parquet",
      "row_count": 13715404,
      "columns": [
        "protein1",
        "protein2",
        "neighborhood",
        "fusion",
        "cooccurence",
        "coexpression",
        "experimental",
        "database",
        "textmining",
        "combined_score"
      ],
      "upstream_sources": [
        "STRING"
      ],
      "raw_input_files": [
        "01_lookup_tables/string_annotations.txt"
      ],
      "processing_scripts": [
        "app startup loader"
      ],
      "key_ids": [],
      "where_used": [
        "/api/ppi_expand",
        "Protein network expansion"
      ],
      "update_notes": "High-volume protein interaction table used for dynamic PPI neighborhood expansion."
    },
    {
      "dataset_id": "subcellular_location",
      "parquet_file": "subcellular_location.parquet",
      "row_count": 13534,
      "columns": [
        "gene",
        "gene_name",
        "reliability",
        "main_location",
        "additional_location",
        "extracellular_location",
        "enhanced",
        "supported",
        "approved",
        "uncertain",
        "single-cell_variation_intensity",
        "single-cell_variation_spatial",
        "cell_cycle_dependency",
        "go_id"
      ],
      "upstream_sources": [
        "Human Protein Atlas"
      ],
      "raw_input_files": [
        "02_hpa_expression/normal_tissue.tsv"
      ],
      "processing_scripts": [
        "02_hpa_expression/process_hpa.py"
      ],
      "key_ids": [],
      "where_used": [
        "/api/details",
        "Subcellular tab"
      ],
      "update_notes": "Protein subcellular location attributes and GO terms."
    }
  ]
}
