-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphredsort.go
More file actions
320 lines (281 loc) · 10.7 KB
/
phredsort.go
File metadata and controls
320 lines (281 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
package main
import (
"fmt"
"math"
"os"
"strings"
"github.com/fatih/color"
"github.com/maruel/natural"
"github.com/spf13/cobra"
)
const (
VERSION = "1.5.0"
PHRED_OFFSET = 33
DEFAULT_MIN_PHRED = 15 // min Phred score threshold for `lqcount` and `lqpercent` metrics
)
// Mock exit function for testing
var exitFunc = os.Exit
// QualityMetric represents different methods for calculating sequence quality
type QualityMetric int
const (
AvgPhred QualityMetric = iota
MaxEE
Meep
LQCount
LQPercent
)
// String returns the string representation of a QualityMetric
func (m QualityMetric) String() string {
switch m {
case AvgPhred:
return "avgphred"
case MaxEE:
return "maxee"
case Meep:
return "meep"
case LQCount:
return "lqcount"
case LQPercent:
return "lqpercent"
default:
return "unknown"
}
}
// validateMetric parses and validates a metric string, returning the corresponding
// QualityMetric enum value. Returns an error if the metric string is invalid
//
// Valid metric strings: "avgphred", "maxee", "meep", "lqcount", "lqpercent"
//
// Example:
// metric, err := validateMetric("avgphred") // Returns AvgPhred, nil
// metric, err := validateMetric("invalid") // Returns AvgPhred, error
func validateMetric(metricStr string) (QualityMetric, error) {
switch strings.ToLower(metricStr) {
case "avgphred":
return AvgPhred, nil
case "maxee":
return MaxEE, nil
case "meep":
return Meep, nil
case "lqcount":
return LQCount, nil
case "lqpercent":
return LQPercent, nil
default:
return AvgPhred, fmt.Errorf("invalid metric '%s'. Must be one of: avgphred, maxee, meep, lqcount, lqpercent", metricStr)
}
}
// QualityRecord stores just the essential info for sorting
type QualityRecord struct {
Offset int64 // File offset for a record
AvgQual float64 // Average quality score
}
// QualityFloat is for sorting sequences by quality scores
type QualityFloat struct {
Name string
Value float64
Metric QualityMetric
}
// QualityFloatList is a slice of QualityFloat with sorting direction
// It implements sort.Interface for sorting sequences by quality metrics
type QualityFloatList struct {
items []QualityFloat
ascending bool // true for ascending, false for descending
}
// NewQualityFloatList creates a new QualityFloatList with the given items and sort direction
func NewQualityFloatList(items []QualityFloat, ascending bool) QualityFloatList {
return QualityFloatList{items: items, ascending: ascending}
}
func (list QualityFloatList) Len() int { return len(list.items) }
func (list QualityFloatList) Swap(i, j int) {
list.items[i], list.items[j] = list.items[j], list.items[i]
}
// Less implements sort.Interface. It determines the sort order based on the metric type:
// - For AvgPhred (higher is better): default order is highest to lowest
// - For MaxEE, Meep, LQCount, LQPercent (lower is better): default order is lowest to highest
// - The ascending flag flips the default ordering
// - Secondary sort is by name using natural ordering (e.g., seq1, seq2, seq10)
func (list QualityFloatList) Less(i, j int) bool {
metric := list.getMetricFromValue()
// Compare values based on metric type
var result bool
if metric == MaxEE || metric == Meep || metric == LQCount || metric == LQPercent {
// For these metrics, higher values indicate lower quality
if list.items[i].Value != list.items[j].Value {
result = list.items[i].Value < list.items[j].Value
} else {
result = natural.Less(list.items[i].Name, list.items[j].Name)
}
} else {
// For other metrics (e.g., AvgPhred), higher values indicate better quality
if list.items[i].Value != list.items[j].Value {
result = list.items[i].Value > list.items[j].Value
} else {
result = natural.Less(list.items[i].Name, list.items[j].Name)
}
}
// Flip the result if we want ascending order
if list.ascending {
return !result
}
return result
}
// Helper function to get metric from QualityFloatList
func (list QualityFloatList) getMetricFromValue() QualityMetric {
if len(list.items) > 0 {
return list.items[0].Metric
}
return AvgPhred // Default
}
// Helper function to get the underlying items
func (list QualityFloatList) Items() []QualityFloat {
return list.items
}
// QualityIndex is a memory-efficient alternative to QualityFloat for sorting.
// Uses int32 index (4 bytes) + float32 value (4 bytes) = 8 bytes total
// compared to QualityFloat which uses ~50+ bytes due to string storage
type QualityIndex struct {
Index int32 // Position in records slice
Value float32 // Quality score (float32 provides sufficient precision)
}
// QualityIndexList implements sort.Interface for memory-efficient sorting.
// It references an external names slice for natural sort tie-breaking
type QualityIndexList struct {
items []QualityIndex
names []string // External reference for tie-breaking
ascending bool
metric QualityMetric
}
// NewQualityIndexList creates a new QualityIndexList with the given items and sort direction
func NewQualityIndexList(items []QualityIndex, names []string, ascending bool, metric QualityMetric) *QualityIndexList {
return &QualityIndexList{
items: items,
names: names,
ascending: ascending,
metric: metric,
}
}
func (list *QualityIndexList) Len() int { return len(list.items) }
func (list *QualityIndexList) Swap(i, j int) {
list.items[i], list.items[j] = list.items[j], list.items[i]
}
// Less implements sort.Interface with the same semantics as QualityFloatList
func (list *QualityIndexList) Less(i, j int) bool {
vi, vj := list.items[i].Value, list.items[j].Value
var result bool
if list.metric == MaxEE || list.metric == Meep || list.metric == LQCount || list.metric == LQPercent {
// For these metrics, lower values indicate better quality
if vi != vj {
result = vi < vj
} else {
// Tie-break using natural sort on names
result = natural.Less(list.names[list.items[i].Index], list.names[list.items[j].Index])
}
} else {
// For AvgPhred and other metrics, higher values indicate better quality
if vi != vj {
result = vi > vj
} else {
result = natural.Less(list.names[list.items[i].Index], list.names[list.items[j].Index])
}
}
if list.ascending {
return !result
}
return result
}
// Items returns the underlying items slice
func (list *QualityIndexList) Items() []QualityIndex {
return list.items
}
// Define color functions
var (
bold = color.New(color.Bold).SprintFunc()
cyan = color.New(color.FgCyan).SprintFunc()
yellow = color.New(color.FgYellow).SprintFunc()
red = color.New(color.FgRed).SprintFunc()
)
func getColorizedLogo() string {
symbols := []rune{'⣿', '⣶', '⣦', '⣄', '⣀'}
var logo strings.Builder
// Create gradient by assigning specific green intensities to each symbol
colors := []struct{ r, g, b int }{
{0, 255, 0}, // Brightest green
{0, 204, 0},
{0, 153, 0},
{0, 102, 0},
{0, 51, 0}, // Darkest green
}
for i, symbol := range symbols {
logo.WriteString(color.RGB(colors[i].r, colors[i].g, colors[i].b).Sprint(string(symbol)))
}
return logo.String()
}
// Variable declarations (at package level)
var (
// Command-line flags for the default sorting command
inFile string
outFile string
metric string
minPhred int
minQualFilter float64
maxQualFilter float64
headerMetrics string
ascending bool
compLevel int
version bool
)
func main() {
// Create root command
rootCmd := &cobra.Command{
Use: "phredsort",
Short: bold("Sorts FASTQ files by quality metrics"),
// When no subcommand is specified, run the default sorting behavior
Run: runDefaultCommand,
}
// The default command = quality estimation and sorting
defaultCmd := &cobra.Command{
Use: "sort",
Short: "Sort sequences by calculating quality metrics",
Run: runDefaultCommand,
}
// Define flags for the default sorting behavior
// The same flags are registered on both the root command and the explicit "sort" subcommand,
// so that both of the following commands are supported:
// phredsort -i in.fq -o out.fq ...
// phredsort sort -i in.fq -o out.fq ...
rootFlags := rootCmd.Flags()
rootFlags.StringVarP(&inFile, "in", "i", "-", "Input FASTQ file (default: stdin)")
rootFlags.StringVarP(&outFile, "out", "o", "-", "Output FASTQ file (default: stdout)")
rootFlags.StringVarP(&metric, "metric", "s", "avgphred", "Quality metric (avgphred, maxee, meep, lqcount, lqpercent)")
rootFlags.IntVarP(&minPhred, "minphred", "p", DEFAULT_MIN_PHRED, "Quality threshold for 'lqcount' and 'lqpercent' metrics")
rootFlags.Float64VarP(&minQualFilter, "minqual", "m", -math.MaxFloat64, "Minimum quality threshold for filtering")
rootFlags.Float64VarP(&maxQualFilter, "maxqual", "M", math.MaxFloat64, "Maximum quality threshold for filtering")
rootFlags.StringVarP(&headerMetrics, "header", "H", "", "Comma-separated list of metrics to add to headers (e.g., 'avgphred,maxee,length')")
rootFlags.BoolVarP(&ascending, "ascending", "a", false, "Sort sequences in ascending order of quality (default: descending)")
rootFlags.IntVarP(&compLevel, "compress", "c", 1, "Memory compression level for stdin-based mode (0=disabled, 1-22; default: 1)")
rootFlags.BoolVarP(&version, "version", "v", false, "Show version information")
sortFlags := defaultCmd.Flags()
sortFlags.StringVarP(&inFile, "in", "i", "-", "Input FASTQ file (default: stdin)")
sortFlags.StringVarP(&outFile, "out", "o", "-", "Output FASTQ file (default: stdout)")
sortFlags.StringVarP(&metric, "metric", "s", "avgphred", "Quality metric (avgphred, maxee, meep, lqcount, lqpercent)")
sortFlags.IntVarP(&minPhred, "minphred", "p", DEFAULT_MIN_PHRED, "Quality threshold for 'lqcount' and 'lqpercent' metrics")
sortFlags.Float64VarP(&minQualFilter, "minqual", "m", -math.MaxFloat64, "Minimum quality threshold for filtering")
sortFlags.Float64VarP(&maxQualFilter, "maxqual", "M", math.MaxFloat64, "Maximum quality threshold for filtering")
sortFlags.StringVarP(&headerMetrics, "header", "H", "", "Comma-separated list of metrics to add to headers (e.g., 'avgphred,maxee,length')")
sortFlags.BoolVarP(&ascending, "ascending", "a", false, "Sort sequences in ascending order of quality (default: descending)")
sortFlags.IntVarP(&compLevel, "compress", "c", 1, "Memory compression level for stdin-based mode (0=disabled, 1-22; default: 1)")
sortFlags.BoolVarP(&version, "version", "v", false, "Show version information")
// Add commands
rootCmd.AddCommand(defaultCmd) // sort using quality estimation
rootCmd.AddCommand(NoSortCommand()) // estimate quality without sorting
rootCmd.AddCommand(HeaderSortCommand()) // sort using pre-computed quality scores
// Set help function
rootCmd.SetHelpFunc(helpFunc)
// Execute
if err := rootCmd.Execute(); err != nil {
fmt.Fprintln(os.Stderr, red(err.Error()))
fmt.Fprintln(os.Stderr, red("Try 'phredsort --help' for more information"))
exitFunc(1)
}
}