-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathPolyTable_to_Fasta.py
More file actions
executable file
·49 lines (43 loc) · 1.67 KB
/
PolyTable_to_Fasta.py
File metadata and controls
executable file
·49 lines (43 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/evn python
# Script to convert a Hudson-like polymorphism table to a FASTA file for
# input into 'compute' (libsequence)
# The input file format does not have a typical Htable header, and is a
# matrix of genotyping data, so we also have to filter on monomorphic
# markers.
import sys
samples = []
genotypes = []
with open(sys.argv[1], 'r') as f:
for index, line in enumerate(f):
if index == 0:
continue
else:
tmp = line.strip().split('\t')
samples.append(tmp[0])
# Replace the diploid genotypes with haploid ones
hap_gt = [list(set(a))[0] if len(set(a))==1 else 'N' for a in tmp[1:]]
# Then, replaces the 'B' with 'C'
hap_gt_nuc = ['C' if a=='B' else a for a in hap_gt]
genotypes.append(hap_gt_nuc)
# next we transpose the matrix so we can iterate over markers instead of
# individuals.
genotypes = zip(*genotypes)
newmarkers = []
for marker in genotypes:
# cast it to a set, which gives just unique values
alleles = set(marker)
# if there are missing calls, ignore them for now
# we use set.discard() because it won't throw a KeyError if there is no N
alleles.discard('N')
# If the remaining set is only one item long, then the marker is
# monomorphic in the population and we remove it
if len(alleles) == 1:
continue
else:
newmarkers.append(marker)
# transpose it back so we can tack all the individuals together
newmarkers = zip(*newmarkers)
# next, we print out the FASTA file
for individual in zip(samples, newmarkers):
print '>' + individual[0]
print ''.join(list(individual[1]))