Feature selection before feature-level data summarization

MSstatsSelectFeatures(input, method, top_n = 3, min_feature_count = 2)

Arguments

input

data.table

method

feature selection method: one of "all", "highQuality", or "topN"

top_n

number of features to use for "topN" method

min_feature_count

minimum number of quality features for the "highQuality" method (default 2)

Value

data.table

Examples

raw = DDARawData
method = "TMP"
cens = "NA"
impute = TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
input = MSstatsPrepareForDataProcess(raw, 2, NULL)
#> INFO [2021-07-05 20:05:28] ** Features with one or two measurements across runs are removed.
#> INFO [2021-07-05 20:05:28] ** Fractionation handled.
#> INFO [2021-07-05 20:05:28] ** Updated quantification data to make balanced design. Missing values are marked by NA
input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
input = MSstatsMergeFractions(input)
input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#> INFO [2021-07-05 20:05:28] ** Log2 intensities under cutoff = 13.456 were considered as censored missing values.
#> INFO [2021-07-05 20:05:28] ** Log2 intensities = NA were considered as censored missing values.
input_all = MSstatsSelectFeatures(input, "all") # all features
#> INFO [2021-07-05 20:05:28] ** Use all features that the dataset originally has.
input_5 = MSstatsSelectFeatures(data.table::copy(input), "topN", top_n = 5) # top 5 features
#> INFO [2021-07-05 20:05:28] ** Use top5 features that have highest average of log2(intensity) across runs.
input_informative = MSstatsSelectFeatures(input, "highQuality") # feature selection
#> INFO [2021-07-05 20:05:29] ** Flag uninformative feature and outliers by feature selection algorithm.
head(input_all)
#> PROTEIN PEPTIDE TRANSITION #> 1: bovine D.GPLTGTYR_23_23 NA_NA #> 2: bovine F.HFHWGSSDDQGSEHTVDR_402_402 NA_NA #> 3: bovine F.HWGSSDDQGSEHTVDR_229_229 NA_NA #> 4: bovine G.PLTGTYR_8_8 NA_NA #> 5: bovine H.SFNVEYDDSQDK_465_465 NA_NA #> 6: bovine K.AVVQDPALKPL_156_156 NA_NA #> FEATURE LABEL GROUP_ORIGINAL SUBJECT_ORIGINAL RUN #> 1: D.GPLTGTYR_23_23_NA_NA L C1 1 1 #> 2: F.HFHWGSSDDQGSEHTVDR_402_402_NA_NA L C1 1 1 #> 3: F.HWGSSDDQGSEHTVDR_229_229_NA_NA L C1 1 1 #> 4: G.PLTGTYR_8_8_NA_NA L C1 1 1 #> 5: H.SFNVEYDDSQDK_465_465_NA_NA L C1 1 1 #> 6: K.AVVQDPALKPL_156_156_NA_NA L C1 1 1 #> GROUP SUBJECT FRACTION INTENSITY ABUNDANCE originalRUN censored #> 1: 1 1 1 757400.1 19.83052 1 FALSE #> 2: 1 1 1 2087125.8 21.29291 1 FALSE #> 3: 1 1 1 1485145.8 20.80200 1 FALSE #> 4: 1 1 1 4986404.0 22.54939 1 FALSE #> 5: 1 1 1 2488141.2 21.54646 1 FALSE #> 6: 1 1 1 7519322.0 23.14200 1 FALSE
head(input_5)
#> PROTEIN PEPTIDE TRANSITION #> 1: bovine D.GPLTGTYR_23_23 NA_NA #> 2: bovine F.HFHWGSSDDQGSEHTVDR_402_402 NA_NA #> 3: bovine F.HWGSSDDQGSEHTVDR_229_229 NA_NA #> 4: bovine G.PLTGTYR_8_8 NA_NA #> 5: bovine H.SFNVEYDDSQDK_465_465 NA_NA #> 6: bovine K.AVVQDPALKPL_156_156 NA_NA #> FEATURE LABEL GROUP_ORIGINAL SUBJECT_ORIGINAL RUN #> 1: D.GPLTGTYR_23_23_NA_NA L C1 1 1 #> 2: F.HFHWGSSDDQGSEHTVDR_402_402_NA_NA L C1 1 1 #> 3: F.HWGSSDDQGSEHTVDR_229_229_NA_NA L C1 1 1 #> 4: G.PLTGTYR_8_8_NA_NA L C1 1 1 #> 5: H.SFNVEYDDSQDK_465_465_NA_NA L C1 1 1 #> 6: K.AVVQDPALKPL_156_156_NA_NA L C1 1 1 #> GROUP SUBJECT FRACTION INTENSITY ABUNDANCE originalRUN censored remove #> 1: 1 1 1 757400.1 19.83052 1 FALSE TRUE #> 2: 1 1 1 2087125.8 21.29291 1 FALSE TRUE #> 3: 1 1 1 1485145.8 20.80200 1 FALSE TRUE #> 4: 1 1 1 4986404.0 22.54939 1 FALSE FALSE #> 5: 1 1 1 2488141.2 21.54646 1 FALSE TRUE #> 6: 1 1 1 7519322.0 23.14200 1 FALSE FALSE
head(input_informative)
#> LABEL PROTEIN FEATURE originalRUN PEPTIDE TRANSITION #> 1: L bovine D.GPLTGTYR_23_23_NA_NA 1 D.GPLTGTYR_23_23 NA_NA #> 2: L bovine D.GPLTGTYR_23_23_NA_NA 10 D.GPLTGTYR_23_23 NA_NA #> 3: L bovine D.GPLTGTYR_23_23_NA_NA 11 D.GPLTGTYR_23_23 NA_NA #> 4: L bovine D.GPLTGTYR_23_23_NA_NA 12 D.GPLTGTYR_23_23 NA_NA #> 5: L bovine D.GPLTGTYR_23_23_NA_NA 13 D.GPLTGTYR_23_23 NA_NA #> 6: L bovine D.GPLTGTYR_23_23_NA_NA 14 D.GPLTGTYR_23_23 NA_NA #> GROUP_ORIGINAL SUBJECT_ORIGINAL RUN GROUP SUBJECT FRACTION INTENSITY #> 1: C1 1 1 1 1 1 757400.1 #> 2: C4 1 10 4 1 1 NA #> 3: C4 1 11 4 1 1 NA #> 4: C4 1 12 4 1 1 NA #> 5: C5 1 13 5 1 1 NA #> 6: C5 1 14 5 1 1 NA #> ABUNDANCE censored feature_quality is_outlier #> 1: 19.83052 FALSE Informative FALSE #> 2: NA TRUE Informative FALSE #> 3: NA TRUE Informative FALSE #> 4: NA TRUE Informative FALSE #> 5: NA TRUE Informative FALSE #> 6: NA TRUE Informative FALSE