Switch from individual to global BED / BIM / FAM files.
author Dimitri Papadopoulos <dimitri.papadopoulos@cea.fr>
Fri, 12 Jul 2013 16:47:39 +0200
changeset 294 2a9b2a5acf8e
parent 293 7b637a322eb2
child 295 6a88c772d7cd
Switch from individual to global BED / BIM / FAM files. * Read FAM file as a CSV file and extract subject identifiers from there. * All subjects are related to the same BED file, Localizer94.bed. Additionally, get rid of the sprintBack subdirectory of genetics. I have moved all necessary hg1?.refGene.* files out of this subdirectory.
importers/localizer.py
--- a/importers/localizer.py	Wed Jul 17 13:16:01 2013 +0200
+++ b/importers/localizer.py	Fri Jul 12 16:47:39 2013 +0200
@@ -278,17 +278,23 @@
 ### Genomics entities #########################################################
 ###############################################################################
 # XXX These functions may be pushed in helpers, as they may be more general
-def import_genomic_measures(genetics_dir):
+def import_genomic_measures(measure_path, genetics_basename):
     """Import a genomic measures"""
-    measure_path = os.path.join(genetics_dir, 'sprintBack', 'bysubj')
     g_measures = {}
-    for path in glob.glob(os.path.join(measure_path, '*.bim')):
-        subject_id = os.path.split(path)[1].split('.bim')[0]
+    # path to BED / BIM / FAM files
+    bim_path = os.path.join(measure_path, genetics_basename + '.bim')
+    fam_path = os.path.join(measure_path, genetics_basename + '.fam')
+    # read FAM file as CSV file
+    fam_file = open(fam_path, 'rU')
+    fam_reader = csv.reader(fam_file, delimiter=' ')
+    # one subject per line
+    for row in fam_reader:
+        subject_id = row[1]
         genomic_measure = {}
         genomic_measure['identifier'] = u'genomic_measure_%s' % subject_id
         genomic_measure['type'] = u'SNP'
         genomic_measure['format'] = u'plink'
-        genomic_measure['filepath'] = unicode(path)
+        genomic_measure['filepath'] = unicode(bim_path)
         genomic_measure['chip_serialnum'] = None
         genomic_measure['completed'] = True
         genomic_measure['valid'] = True
@@ -334,7 +340,7 @@
         chr_map.setdefault(_chr['name'], _chr.eid)
     # Genes
     genes = import_genes(os.path.join(genetics_dir, 'chromosomes.json'),
-                         os.path.join(genetics_dir, 'sprintBack', 'hg18.refGene.meta'))
+                         os.path.join(genetics_dir, 'hg18.refGene.meta'))
     for gene in genes:
         print 'gene', gene['name'], gene['chromosome']
         gene['chromosome'] = chr_map[gene['chromosome']]
@@ -344,7 +350,7 @@
         store.flush()
     # Snps
     snps = import_snps(os.path.join(genetics_dir, 'chromosomes.json'),
-                       os.path.join(genetics_dir, 'sprintBack', 'bysubj', 'bru3506.bim'))
+                       os.path.join(genetics_dir, 'Localizer94.bim'))
     snp_eids = []
     for ind, snp in enumerate(snps):
         print 'snp', snp['rs_id']
@@ -363,7 +369,7 @@
         store.relate(platform.eid, 'related_snps', snp_eid)
 
     ### Genetics measures #####################################################
-    gen_measures = import_genomic_measures(genetics_dir)
+    gen_measures = import_genomic_measures(genetics_dir, 'Localizer94')
 
     # Flush/Commit
     if sqlgen_store: