Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion include/kmtricks/cli/cli_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,27 @@ inline void add_common(bc::cmd_t cmd, km_options_t options)
->setter(options->verbosity);
}

}; // namespace kmdiff
template <typename T>
inline void add_bam_options(bc::cmd_t cmd, std::shared_ptr<T> options)
{
cmd->add_group("bam filtering options", "");

cmd->add_param("--bam-exclude-refs", "exclude reads mapping to specified references (comma-separated, e.g., chrM,chrY)")
->meta("STR")
->def("")
->setter(options->bam_exclude_refs);

cmd->add_param("-f/--bam-include-flags", "include reads with ALL these flags set (samtools -f style, decimal or hex with 0x prefix)")
->meta("INT")
->def("0")
->checker(bc::check::is_number)
->setter(options->bam_include_flags);

cmd->add_param("-F/--bam-exclude-flags", "exclude reads with ANY of these flags set (samtools -F style, decimal or hex with 0x prefix)")
->meta("INT")
->def("0")
->checker(bc::check::is_number)
->setter(options->bam_exclude_flags);
}

}; // namespace kmdiff
6 changes: 4 additions & 2 deletions include/kmtricks/cmd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ struct main_repart
opt->repart_type,
1,
opt->nb_parts);
ConfigTask<MAX_K> config_task(opt->fof, props, opt->bloom_size, opt->nb_parts);
ConfigTask<MAX_K> config_task(opt->fof, props, opt->bloom_size, opt->nb_parts,
opt->bam_exclude_refs, opt->bam_include_flags, opt->bam_exclude_flags);
config_task.exec();
RepartTask<MAX_K> repart_task(opt->fof); repart_task.exec(); repart_task.postprocess();
RepartTask<MAX_K> repart_task(opt->fof, opt->bam_exclude_refs, opt->bam_include_flags, opt->bam_exclude_flags);
repart_task.exec(); repart_task.postprocess();

Storage* config_storage = StorageFactory(STORAGE_FILE).load(KmDir::get().m_config_storage);
LOCAL(config_storage);
Expand Down
7 changes: 7 additions & 0 deletions include/kmtricks/cmd/all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ struct all_options : km_options

std::string from;

std::string bam_exclude_refs;
uint32_t bam_include_flags {0};
uint32_t bam_exclude_flags {0};

MODE mode;
FORMAT format;
OUT_FORMAT out_format;
Expand Down Expand Up @@ -103,6 +107,9 @@ struct all_options : km_options
RECORD(ss, focus);
RECORD(ss, restrict_to);
RECORD(ss, bwidth);
RECORD(ss, bam_exclude_refs);
RECORD(ss, bam_include_flags);
RECORD(ss, bam_exclude_flags);
#ifdef WITH_PLUGIN
RECORD(ss, use_plugin);
RECORD(ss, plugin);
Expand Down
6 changes: 6 additions & 0 deletions include/kmtricks/cmd/repart.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ struct repart_options : km_options
uint32_t repart_type;
uint32_t nb_parts;
uint64_t bloom_size;
std::string bam_exclude_refs;
uint32_t bam_include_flags {0};
uint32_t bam_exclude_flags {0};

std::string display()
{
Expand All @@ -49,6 +52,9 @@ struct repart_options : km_options
RECORD(ss, minim_type);
RECORD(ss, repart_type);
RECORD(ss, nb_parts);
RECORD(ss, bam_exclude_refs);
RECORD(ss, bam_include_flags);
RECORD(ss, bam_exclude_flags);
std::string ret = ss.str(); ret.pop_back(); ret.pop_back();
return ret;
}
Expand Down
55 changes: 51 additions & 4 deletions include/kmtricks/task.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,42 @@
#include <kmtricks/plugin_manager.hpp>
#endif

#include <gatb/bank/impl/BankBam.hpp>

namespace km {

inline void apply_bam_filtering(IBank* bank, const std::string& bam_exclude_refs,
uint32_t bam_include_flags = 0, uint32_t bam_exclude_flags = 0) {
BankBam* bam_bank = dynamic_cast<BankBam*>(bank);
if (!bam_bank) return;

// Apply reference filtering
if (!bam_exclude_refs.empty()) {
std::set<std::string> excluded_refs;
std::istringstream ss(bam_exclude_refs);
std::string ref;
while (std::getline(ss, ref, ',')) {
// Trim whitespace
ref.erase(0, ref.find_first_not_of(" \t"));
ref.erase(ref.find_last_not_of(" \t") + 1);
if (!ref.empty()) {
excluded_refs.insert(ref);
}
}
if (!excluded_refs.empty()) {
bam_bank->setExcludedReferences(excluded_refs);
}
}

// Apply flag filtering (samtools -f/-F style)
if (bam_include_flags != 0) {
bam_bank->setRequireFlags(static_cast<uint16_t>(bam_include_flags));
}
if (bam_exclude_flags != 0) {
bam_bank->setExcludeFlags(static_cast<uint16_t>(bam_exclude_flags));
}
}

namespace fs = std::filesystem;
using parti_info_t = std::shared_ptr<PartiInfo<5>>;

Expand All @@ -52,8 +86,11 @@ class ConfigTask : public ITask
{
public:
ConfigTask(const std::string& path, IProperties* props, uint64_t bloom_size,
uint32_t partitions)
: ITask(0), m_path(path), m_props(props), m_bloom_size(bloom_size), m_nb_partitions(partitions)
uint32_t partitions, const std::string& bam_exclude_refs = "",
uint32_t bam_include_flags = 0, uint32_t bam_exclude_flags = 0)
: ITask(0), m_path(path), m_props(props), m_bloom_size(bloom_size), m_nb_partitions(partitions),
m_bam_exclude_refs(bam_exclude_refs), m_bam_include_flags(bam_include_flags),
m_bam_exclude_flags(bam_exclude_flags)
{}

void preprocess() {}
Expand All @@ -63,6 +100,7 @@ class ConfigTask : public ITask
spdlog::debug("[exec] - ConfigTask");
spdlog::info("{} samples found ({} read files).", KmDir::get().m_fof.size(), KmDir::get().m_fof.total());
IBank* bank = Bank::open(KmDir::get().m_fof.get_all()); LOCAL(bank);
apply_bam_filtering(bank, m_bam_exclude_refs, m_bam_include_flags, m_bam_exclude_flags);
Storage* config_storage =
StorageFactory(STORAGE_FILE).create(KmDir::get().m_config_storage, true, false);
LOCAL(config_storage);
Expand Down Expand Up @@ -90,6 +128,9 @@ class ConfigTask : public ITask
IProperties* m_props;
uint64_t m_bloom_size;
uint32_t m_nb_partitions;
std::string m_bam_exclude_refs;
uint32_t m_bam_include_flags;
uint32_t m_bam_exclude_flags;
};

void check_repart_compatibility(Configuration& c1, Configuration& c2,
Expand All @@ -109,8 +150,10 @@ template<size_t span>
class RepartTask : public ITask
{
public:
RepartTask(const std::string& path, const std::string& from = "")
: ITask(1), m_path(path), m_from(from) {}
RepartTask(const std::string& path, const std::string& bam_exclude_refs = "",
uint32_t bam_include_flags = 0, uint32_t bam_exclude_flags = 0, const std::string& from = "")
: ITask(1), m_path(path), m_bam_exclude_refs(bam_exclude_refs),
m_bam_include_flags(bam_include_flags), m_bam_exclude_flags(bam_exclude_flags), m_from(from) {}

void preprocess() {}
void postprocess()
Expand Down Expand Up @@ -140,6 +183,7 @@ class RepartTask : public ITask
{
Fof fof(m_path);
IBank* bank = Bank::open(fof.get_all()); LOCAL(bank);
apply_bam_filtering(bank, m_bam_exclude_refs, m_bam_include_flags, m_bam_exclude_flags);
Storage* rep_store =
StorageFactory(STORAGE_FILE).create(KmDir::get().m_repart_storage, true, false);

Expand Down Expand Up @@ -170,6 +214,9 @@ class RepartTask : public ITask

private:
std::string m_path;
std::string m_bam_exclude_refs;
uint32_t m_bam_include_flags;
uint32_t m_bam_exclude_flags;
std::string m_from;
int m_cores;
uint32_t m_nb_parts {0};
Expand Down
3 changes: 2 additions & 1 deletion include/kmtricks/task_scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ class TaskScheduler
1,
m_opt->nb_parts,
m_opt->max_memory);
ConfigTask<MAX_K> config_task(m_opt->fof, props, m_opt->bloom_size, m_opt->nb_parts);
ConfigTask<MAX_K> config_task(m_opt->fof, props, m_opt->bloom_size, m_opt->nb_parts,
m_opt->bam_exclude_refs, m_opt->bam_include_flags, m_opt->bam_exclude_flags);
config_task.exec();
Storage* config_storage = StorageFactory(STORAGE_FILE).load(KmDir::get().m_config_storage);
LOCAL(config_storage);
Expand Down
4 changes: 4 additions & 0 deletions src/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ km_options_t all_cli(std::shared_ptr<bc::Parser<1>> cli, all_options_t options)
->checker(bc::check::is_number)
->setter(options->bwidth);

add_bam_options(all_cmd, options);

#ifdef WITH_PLUGIN
auto plugin_setter = [options](const std::string& v) {
options->plugin = v;
Expand Down Expand Up @@ -428,6 +430,8 @@ km_options_t repart_cli(std::shared_ptr<bc::Parser<1>> cli, repart_options_t opt
->checker(bc::check::is_number)
->setter(options->bloom_size);

add_bam_options(repart_cmd, options);

add_common(repart_cmd, options);

return options;
Expand Down
Binary file added test.bam
Binary file not shown.
50 changes: 50 additions & 0 deletions tests/bam_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include <gtest/gtest.h>
#include <gatb/gatb_core.hpp>

using namespace gatb::core::bank;

class BamTest : public ::testing::Test {
protected:
void SetUp() override {
test_fasta = "data/1.fasta";
test_bam = "../test.bam";
}

std::string test_fasta;
std::string test_bam;
};

TEST_F(BamTest, FormatDetection) {
// Test that BAM and FASTA files are correctly detected
EXPECT_EQ(Bank::getType(test_bam), "bam");
EXPECT_EQ(Bank::getType(test_fasta), "fasta");
}

TEST_F(BamTest, SequenceEquality) {
// Test that BAM and FASTA return identical sequences
IBank* bam_bank = Bank::open(test_bam);
IBank* fasta_bank = Bank::open(test_fasta);

ASSERT_NE(bam_bank, nullptr);
ASSERT_NE(fasta_bank, nullptr);

gatb::core::tools::dp::Iterator<Sequence>* bam_it = bam_bank->iterator();
gatb::core::tools::dp::Iterator<Sequence>* fasta_it = fasta_bank->iterator();

int count = 0;
for (bam_it->first(), fasta_it->first();
!bam_it->isDone() && !fasta_it->isDone();
bam_it->next(), fasta_it->next()) {
count++;
EXPECT_EQ(bam_it->item().toString(), fasta_it->item().toString())
<< "Sequence " << count << " should match";
}

EXPECT_EQ(count, 2);
EXPECT_TRUE(bam_it->isDone() && fasta_it->isDone());

delete bam_it;
delete fasta_it;
delete bam_bank;
delete fasta_bank;
}
25 changes: 25 additions & 0 deletions tests/test_ref_filter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include <gatb/gatb_core.hpp>
#include <iostream>
#include <set>

using namespace gatb::core::bank;

int main() {
std::string test_bam = "../test.bam";

// First, read without filtering
std::cout << "Reading without filtering:" << std::endl;
IBank* bank1 = Bank::open(test_bam);
gatb::core::tools::dp::Iterator<Sequence>* it1 = bank1->iterator();
int count1 = 0;
for (it1->first(); !it1->isDone(); it1->next()) {
count1++;
std::cout << " Read " << count1 << ": " << it1->item().getComment() << std::endl;
}
delete it1;
delete bank1;

std::cout << "\nTotal reads without filtering: " << count1 << std::endl;

return 0;
}
2 changes: 2 additions & 0 deletions thirdparty/gatb-core-stripped/src/gatb/bank/impl/Bank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <gatb/bank/impl/Bank.hpp>

#include <gatb/bank/impl/BankFasta.hpp>
#include <gatb/bank/impl/BankBam.hpp>

#include <gatb/bank/impl/BankBinary.hpp>
#include <gatb/bank/impl/BankAlbum.hpp>
Expand All @@ -46,6 +47,7 @@ Bank::Bank ()
{
/** We register most known factories. */
_registerFactory_ ("album", new BankAlbumFactory(), false);
_registerFactory_ ("bam", new BankBamFactory(), false); // BAM before FASTA to prevent misdetection
_registerFactory_ ("fasta", new BankFastaFactory(), false);
_registerFactory_ ("binary", new BankBinaryFactory(), false);

Expand Down
Loading