-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/refactor_DIMS_CollectFilled #97
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
4c937de
cbbf429
22d8347
367a5e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| # CollectFilled functions | ||
|
|
||
| collapse <- function(column_label, peakgroup_list, index_dup) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function name should be more descriptive |
||
| #' Collapse identification info for peak groups with the same mass | ||
| #' | ||
| #' @param column_label: Name of column in peakgroup_list (string) | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' @param index_dup: Index of duplicate peak group (integer) | ||
| #' | ||
| #' @return collapsed_items: Semicolon-separated list of info (string) | ||
| # get the item(s) that need to be collapsed | ||
| list_items <- as.vector(peakgroup_list[index_dup, column_label]) | ||
| # remove NA | ||
| if (length(which(is.na(list_items))) > 0) { | ||
| list_items <- list_items[-which(is.na(list_items))] | ||
| } | ||
| collapsed_items <- paste(list_items, collapse = ";") | ||
| return(collapsed_items) | ||
| } | ||
|
|
||
| merge_duplicate_rows <- function(peakgroup_list) { | ||
| #' Merge identification info for peak groups with the same mass | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' | ||
| #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) | ||
|
|
||
| options(digits = 16) | ||
| collect <- NULL | ||
| remove <- NULL | ||
|
|
||
| # check for peak groups with identical mass | ||
| index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) | ||
|
|
||
| while (length(index_dup) > 0) { | ||
| # get the index for the peak group which is double | ||
| peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) | ||
| single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] | ||
|
|
||
| # use function collapse to concatenate info | ||
| single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) | ||
| if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA | ||
|
|
||
| # keep track of deduplicated entries | ||
| collect <- rbind(collect, single_peakgroup) | ||
| remove <- c(remove, peaklist_index) | ||
|
|
||
| # remove current entry from index | ||
| index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] | ||
| } | ||
|
|
||
| # remove duplicate entries | ||
| if (!is.null(remove)) { | ||
| peakgroup_list <- peakgroup_list[-remove, ] | ||
| } | ||
| # append deduplicated entries | ||
| peakgroup_list_dedup <- rbind(peakgroup_list, collect) | ||
| return(peakgroup_list_dedup) | ||
| } | ||
|
|
||
| calculate_zscores_peakgrouplist <- function(peakgroup_list) { | ||
| #' Calculate Z-scores for peak groups based on average and standard deviation of controls | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' | ||
| #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) | ||
|
|
||
| case_label <- "P" | ||
| control_label <- "C" | ||
|
Comment on lines
+72
to
+73
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are these always the same labels? Otherwise it make sense to add them as parameters |
||
| # get index for new column names | ||
| startcol <- ncol(peakgroup_list) + 3 | ||
|
|
||
| # calculate mean and standard deviation for Control group | ||
| ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) | ||
| case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) | ||
| int_cols <- c(ctrl_cols, case_cols) | ||
| # set all zeros to NA | ||
| peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA | ||
| ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] | ||
| peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) | ||
| peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) | ||
|
|
||
| # set new column names and calculate Z-scores | ||
| colnames_zscores <- NULL | ||
| for (col_index in int_cols) { | ||
| col_name <- colnames(peakgroup_list)[col_index] | ||
| colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) | ||
| zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - | ||
| peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls | ||
| peakgroup_list <- cbind(peakgroup_list, zscores_1col) | ||
| } | ||
|
|
||
| # apply new column names to columns at end plus avg and sd columns | ||
| colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores | ||
|
|
||
| return(peakgroup_list) | ||
| } | ||
|
|
||
| calculate_ppm_deviation <- function(peakgroup_list) { | ||
| #' Calculate ppm deviation between observed mass and expected theoretical mass | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' | ||
| #' @return peakgroup_list_ppm: peak group list with ppm column (matrix) | ||
|
|
||
| # calculate ppm deviation | ||
| for (row_index in seq_len(nrow(peakgroup_list))) { | ||
| if (!is.na(peakgroup_list$theormz_HMDB[row_index]) && | ||
| !is.null(peakgroup_list$theormz_HMDB[row_index]) && | ||
| (peakgroup_list$theormz_HMDB[row_index] != "")) { | ||
| peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) - | ||
| as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) / | ||
| as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index])) | ||
| } else { | ||
| peakgroup_list$ppmdev[row_index] <- NA | ||
| } | ||
| } | ||
|
Comment on lines
+111
to
+121
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part is very hard to read. It would benefit from a refactor |
||
|
|
||
| return(peakgroup_list) | ||
| } | ||
|
|
||
| order_columns_peakgrouplist <- function(peakgroup_list) { | ||
| #' Put columns in peak group list in correct order | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' | ||
| #' @return peakgroup_ordered: peak group list with columns in correct order (matrix) | ||
|
|
||
| original_colnames <- colnames(peakgroup_list) | ||
| mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames)) | ||
| descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames)) | ||
| intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1)) | ||
| # if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total | ||
| control_columns <- grep ("ctrls", original_colnames) | ||
| zscore_columns <- grep("_Zscore", original_colnames) | ||
| # create peak group list with columns in correct order | ||
| peakgroup_ordered <- peakgroup_list[ , c(mass_columns, descriptive_columns, intensity_columns, control_columns, zscore_columns)] | ||
|
|
||
| return(peakgroup_ordered) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| "mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore" "ppmdev" | ||
| "1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 16500 3000 2828.42712474619 9000 13000 90000 130000 0.111112214857712 | ||
| "2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 22000 4000 2828.42712474619 10000 14000 1e+05 140000 0.473299680976197 | ||
| "3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 27500 5000 2828.42712474619 11000 15000 110000 150000 0.17426158930175 | ||
| "4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 33000 6000 2828.42712474619 12000 16000 120000 160000 0.186787674436346 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| # unit tests for CollectFilled | ||
| # functions: collapse, merge_duplicate_rows, calculate_zscores_peakgrouplist, | ||
| # calculate_ppm_deviation, order_columns_peakgrouplist | ||
| source("../../preprocessing/collect_filled_functions.R") | ||
|
|
||
| testthat::test_that("Values for duplicate rows are correctly collapsed", { | ||
| test_matrix <- matrix(letters[1:8], nrow = 2, ncol = 4) | ||
| colnames(test_matrix) <- paste0("column", 1:4) | ||
|
|
||
| expect_equal(collapse("column1", test_matrix, c(1,2)), "a;b", TRUE) | ||
| }) | ||
|
|
||
| testthat::test_that("Duplicate rows in a peak group list are correctly merged", { | ||
| # Copy/symlink the files to the current location for the function | ||
| test_files <- list.files("fixtures/", "test_peakgroup_list", full.names = TRUE) | ||
| file.symlink(file.path(test_files), getwd()) | ||
|
|
||
| # read in peakgroup_list, create duplicate row | ||
| test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t") | ||
| test_peakgroup_list_dup <- test_peakgroup_list[c(1, 2, 2, 3), ] | ||
|
|
||
| # after merging duplicate rows, the test peak group list should have 3 rows | ||
| expect_equal(nrow(merge_duplicate_rows(test_peakgroup_list_dup)), 3, TRUE, tolerance = 0.001) | ||
| expect_equal(merge_duplicate_rows(test_peakgroup_list_dup)[3, "all_hmdb_ids"], | ||
| paste(test_peakgroup_list_dup[2, "all_hmdb_ids"], test_peakgroup_list_dup[3, "all_hmdb_ids"], sep = ";"), | ||
| TRUE) | ||
| }) | ||
|
|
||
| testthat::test_that("Z-scores are correctly calculated in CollectFilled", { | ||
| # read in peakgroup_list; remove avg.int, std.int, noise and Zscore columns | ||
| test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t") | ||
| # remove avg.ctrls, sd.ctrls and Z-score columns | ||
| test_peakgroup_list_noz <- test_peakgroup_list[ , -grep("avg.ctrls", colnames(test_peakgroup_list))] | ||
| test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("sd.ctrls", colnames(test_peakgroup_list_noz))] | ||
| test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("_Zscore", colnames(test_peakgroup_list_noz))] | ||
|
|
||
| # after calculate_zscores_peakgrouplist, there should be 4 columns with _Zscore in the name | ||
| expect_equal(length(grep("_Zscore", colnames(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)))), 4, TRUE, tolerance = 0.001) | ||
|
|
||
| # after calculate_zscores_peakgrouplist, the 4 columns with _Zscore in the name should be filled non-zero | ||
| expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$C101.1_Zscore[1], -0.7071, TRUE, tolerance = 0.00001) | ||
| expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$P2.1_Zscore[4], 12.0208, TRUE, tolerance = 0.00001) | ||
|
|
||
| }) | ||
|
|
||
| testthat::test_that("ppm deviation values are correctly calculated in CollectFilled", { | ||
| # read in peakgroup_list | ||
| test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t") | ||
|
|
||
| # store ppm deviation values | ||
| test_ppm_values <- test_peakgroup_list$ppmdev | ||
|
|
||
| # after calculate_ppm_deviation, there ppm values in the new column should approximate the old ones | ||
| expect_equal(calculate_ppm_deviation(test_peakgroup_list)$ppmdev, test_ppm_values, TRUE, tolerance = 0.001) | ||
|
|
||
| # calculate_ppm_deviation should give NA if there is no value for theormz_HMDB | ||
| test_peakgroup_list$theormz_HMDB[1] <- NA | ||
| expect_identical(is.na(calculate_ppm_deviation(test_peakgroup_list)$ppmdev[1]), TRUE) | ||
|
|
||
| }) | ||
|
|
||
| testthat::test_that("columns in peak group list are corretly sorted", { | ||
| # read in peakgroup_list | ||
| test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t") | ||
| # original order of columns | ||
| original_column_order <- colnames(test_peakgroup_list) | ||
| # after ordering, column names should be re-ordered | ||
| test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)] | ||
|
|
||
| expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order) | ||
|
|
||
| # Remove copied/symlinked files | ||
| files_remove <- list.files("./", "test_peakgroup_list.txt", full.names = TRUE) | ||
| file.remove(files_remove) | ||
|
|
||
| }) | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.