# Working directory and project library setup.
# NOTE(review): setwd() inside a script is fragile -- prefer running from the
# project root (kept here so the relative paths below keep working).
setwd("~/Dropbox")
library(metabolomicsUM)
source("Datasets/Propolis/UVV/scripts/propolis_uv_metadata.R")

# Paths to the UV-Vis data file and the metadata file derived from it
uv.prop.metadata.file <- "Datasets/Propolis/UVV/metadata/prop_uv_metadata.csv"
uv.prop.data.file <- "Datasets/Propolis/UVV/data/uvv.csv"

# Generate the metadata CSV from the data file (written to disk)
get.metadata.uv(uv.prop.data.file, write.file = TRUE,
                file.name = uv.prop.metadata.file)

# Axis labels for the spectra: x = wavelength (nm), values = absorbance
label.x <- "wavelength(nm)"
label.val <- "absorbance"

# Read data + metadata as a UV-Vis spectra dataset
uv.prop.ds <- read.dataset.csv(uv.prop.data.file, uv.prop.metadata.file,
                               description = "UV data for propolis",
                               type = "uvv-spectra", label.x = label.x,
                               label.values = label.val)

# Years are categorical, not numeric
uv.prop.ds$metadata$years <- factor(uv.prop.ds$metadata$years)
sum.dataset(uv.prop.ds)
## Dataset summary:
## Valid dataset
## Description: UV data for propolis
## Type of data: uvv-spectra
## Number of samples: 136
## Number of data points 501
## Number of metadata variables: 3
## Label of x-axis values: wavelength(nm)
## Label of data points: absorbance
## Number of missing values in data: 0
## Mean of data values: 71.93233
## Median of data values: 0.3097
## Standard deviation: 18529.24
## Range of values: 1e-04 4836665
## Quantiles:
## 0% 25% 50% 75% 100%
## 1.000000e-04 1.035000e-01 3.097000e-01 1.009725e+00 4.836665e+06
The seasons metadata variable was used as the class label; no normalization or correction was applied to the data.
PREPROCESSING
Plotting the entire spectra reveals an outlier between the 500 and 600 nm wavelengths (sample ACpri11 at 529 nm). The value will be replaced by linear interpolation.
# Drop samples that have no season annotation
no.seasons <- which(is.na(uv.prop.ds$metadata$seasons))
uv.prop.ds <- remove.samples(uv.prop.ds, no.seasons)
plot.spectra(uv.prop.ds, "seasons")

# The outlier reading (sample ACpri11 at wavelength 529) is set to NA and
# then filled in by linear interpolation; re-plot and summarize to confirm
uv.prop.ds <- replace.data.value(uv.prop.ds, 529, "ACpri11", NA)
uv.prop.ds <- missingvalues.imputation(uv.prop.ds, method = "linapprox")
plot.spectra(uv.prop.ds, "seasons")
sum.dataset(uv.prop.ds)
## Dataset summary:
## Valid dataset
## Description: ; Missing value imputation with method linapprox
## Type of data: uvv-spectra
## Number of samples: 133
## Number of data points 501
## Number of metadata variables: 3
## Label of x-axis values: wavelength(nm)
## Label of data points: absorbance
## Number of missing values in data: 0
## Mean of data values: 0.9414083
## Median of data values: 0.3098
## Standard deviation: 1.293131
## Range of values: 1e-04 4.4118
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.0001 0.1046 0.3098 0.9885 4.4118
UNIVARIATE TESTS
An analysis of variance (ANOVA) was conducted over the data, together with Tukey's post-hoc test, and these are the top 10 results ordered by p-value:
# One-way ANOVA of every wavelength against season (includes Tukey HSD
# pairwise results); show the 10 most significant variables
uv.prop.anova <- aov.all.vars(uv.prop.ds, "seasons")
uv.prop.anova[1:10, ]
## pvalues logs fdr tukey
## 293 1.830690e-07 6.737385 4.706756e-05 out-inv; pri-inv; ver-pri
## 292 1.878944e-07 6.726086 4.706756e-05 out-inv; pri-inv; ver-pri
## 288 6.836297e-07 6.165179 1.141662e-04 out-inv; pri-inv; ver-pri
## 295 1.129625e-06 5.947066 1.402090e-04 out-inv; pri-inv; ver-pri
## 291 1.420224e-06 5.847643 1.402090e-04 out-inv; pri-inv; ver-pri
## 294 1.679150e-06 5.774911 1.402090e-04 out-inv; pri-inv; ver-pri
## 290 2.379572e-06 5.623501 1.703094e-04 out-inv; pri-inv; ver-pri
## 296 3.312019e-06 5.479907 2.074152e-04 out-inv; pri-inv; ver-pri
## 289 5.519455e-06 5.258104 3.072497e-04 out-inv; pri-inv; ver-pri
## 297 1.083861e-05 4.965026 5.430143e-04 out-inv; pri-inv; ver-pri
A heatmap with the correlations between all the variables is shown below:
# Pearson correlation matrix between variables, displayed as a heatmap
uv.prop.correl <- correlations.dataset(uv.prop.ds, method = "pearson")
heatmap(uv.prop.correl, col = topo.colors(256))
CLUSTERING
Hierarchical clustering with euclidean distance and complete method was performed on the data and the resulting dendrogram is shown below:
# Hierarchical clustering with euclidean distance; dendrogram colored
# by the seasons metadata variable
uv.prop.hc <- clustering(uv.prop.ds, method = "hc", distance = "euclidean")
dendrogram.plot.col(uv.prop.ds, uv.prop.hc, "seasons")
K-means was also performed on the data with 3 centers. The results are shown below, together with a plot giving, for each cluster, the median of its samples in blue and the values of all samples in that cluster in grey:
# K-means with 3 clusters; plot per-cluster profiles and list the
# sample membership of each of the 3 clusters
uv.prop.kmeans <- clustering(uv.prop.ds, method = "kmeans", num.clusters = 3)
kmeans.plot(uv.prop.ds, uv.prop.kmeans)
kmeans.result.df(uv.prop.kmeans, 3)
## cluster
## 1 1
## 2 2
## 3 3
## samples
## 1 ANout10 BGout10 BR1out10 JBout10 SJ1out10 SJ2out10 URout10 XXout10 ACinv10 ANinv10 BGinv10 BR2inv10 FPinv10 PUinv10 URinv10 XXinv10 ANpri10 BGpri10 FPpri10 SJ1pri10 SJ2pri10 URpri10 BGver10 BR1ver10 BR2ver10 JBver10 BGout11 CEout11 FPout11 SJ2out11 XXout11 BR2inv11 PUinv11 BGpri11 BR2pri11 BR1pri11 ITpri11 JBpri11 SJ1pri11 SJCpri11 URpri11 ANver11 SJ2inv11 XXinv11 BGver11 FPver11 SJ1ver11 URver11 VRver11
## 2 ACout10 BR2out10 CEout10 DCinv10 ACpri10 CEpri10 CNpri10 PUpri10 SJCpri10 XXpri10 ANver10 CEver10 CNver10 ITver10 PUver10 SJCver10 ITout11 BR1inv11 CEinv11 CNinv11 ACpri11 CEpri11 VRpri11 SJCinv11 BR1ver11 CEver11 CNver11 ITver11 PUver11
## 3 CNout10 CNOout10 DCout10 FPout10 ITout10 PUout10 SAout10 SJCout10 VRout10 CEinv10 CNinv10 CNOinv10 ITinv10 JBinv10 SAinv10 SJCinv10 BR1pri10 BR2pri10 DCpri10 ITpri10 JBpri10 SApri10 VRpri10 ACver10 DCver10 FPver10 SAver10 SJ2ver10 URver10 VRver10 ACout11 ANout11 CNout11 CNOout11 DCout11 PUout11 SAout11 SJCout11 VRout11 ACinv11 ANinv11 BGinv11 CNOinv11 DCinv11 FPinv11 SAinv11 CNpri11 DCpri11 FPpri11 PUpri11 BR2ver11 JBver11 SAver11 SJ2ver11 SJCver11
PCA
Principal components analysis was performed on the data and some plots are shown below:
# PCA on scaled and centered data, followed by pairs, scree, 2D scores
# and kmeans-overlay plots. Use TRUE/FALSE rather than T/F (T and F are
# ordinary variables and can be reassigned).
uv.prop.pca <- pca.analysis.dataset(uv.prop.ds, scale = TRUE, center = TRUE)
pca.pairs.plot(uv.prop.ds, uv.prop.pca, "seasons")
pca.screeplot(uv.prop.pca)
pca.scoresplot2D(uv.prop.ds, uv.prop.pca, "seasons", ellipses = TRUE)
pca.kmeans.plot2D(uv.prop.ds, uv.prop.pca, ellipses = TRUE,
                  num.clusters = 4, labels = TRUE)
MACHINE LEARNING
For classification models and prediction the following parameters were used: - models: PLS, J48, JRip, SVM and Random Forests - validation method: repeated cross-validation - number of folds: 10 - number of repeats: 10
Below are some results with the best tune for each model:
# Train PLS, J48, JRip, linear SVM and random forest classifiers for the
# seasons class, using repeated 10-fold cross-validation with ROC as the
# optimization metric; then show the performance table
uv.prop.ml <- train.models.performance(uv.prop.ds,
                                       c("pls", "J48", "JRip", "svmLinear", "rf"),
                                       "seasons", "repeatedcv", metric = "ROC")
uv.prop.ml$performance
## Accuracy Kappa Sensitivity Specificity ROC AccuracySD
## pls 0.4212129 0.2278712 0.4245833 0.8068258 0.6825841 0.1476135
## J48 0.4141868 0.2160508 0.4114583 0.8041679 0.6156737 0.1259365
## JRip 0.3304400 0.1101806 0.3306250 0.7780669 0.5710988 0.1104275
## svmLinear 0.3930668 0.1928365 0.3995833 0.7980606 0.6943726 0.1194185
## rf 0.4466625 0.2632203 0.4483333 0.8161263 0.7212506 0.1223219
## KappaSD SensitivitySD SpecificitySD ROCSD
## pls 0.1959621 0.1478718 0.04926840 0.10738648
## J48 0.1679080 0.1273755 0.04199369 0.09088184
## JRip 0.1454247 0.1080973 0.03687530 0.09063051
## svmLinear 0.1572932 0.1208043 0.03949831 0.08844256
## rf 0.1623503 0.1232865 0.04068859 0.09493944
And the variable importance in the four season classes for all models:
# Top-10 variable importance for each trained model (per-class where
# the model provides class-specific importances)
summary.var.importance(uv.prop.ml, 10)
## $pls
## inv out pri ver Mean
## 293 100.00000 76.40859 56.223307 75.72783 77.08993
## 292 62.29331 83.92225 56.635394 53.63051 64.12036
## 288 71.91901 55.60418 45.684395 51.91305 56.28016
## 291 47.77205 77.36311 52.495940 43.85652 55.37191
## 295 60.34362 58.07781 52.314489 45.39531 54.03281
## 371 89.45628 33.19448 9.244135 73.58258 51.36937
## 338 91.44179 26.10419 8.388325 77.56437 50.87467
## 313 83.83256 40.81823 6.912971 61.55417 48.27948
## 290 43.23852 63.47245 49.977833 34.48543 47.79356
## 294 47.25424 55.28664 53.753629 34.62655 47.73026
##
## $J48
## inv out pri ver Mean
## 292 100.00000 73.21856 100.00000 67.00007 85.05466
## 288 98.38428 73.89123 98.38428 68.51787 84.79441
## 307 99.30755 77.25457 99.30755 62.66350 84.63329
## 295 100.00000 68.95832 100.00000 64.94020 83.47463
## 286 94.22956 73.44278 94.22956 68.08421 82.49653
## 293 99.30755 68.28565 99.30755 60.92887 81.95740
## 297 97.23019 70.07944 97.23019 61.47094 81.50269
## 287 94.34497 70.30366 94.34497 61.79618 80.19744
## 291 94.46038 65.37076 94.46038 65.69910 79.99765
## 308 89.84402 74.78812 89.84402 61.14569 78.90546
##
## $JRip
## Overall Mean
## 257 100 100
## 312 100 100
## 200 0 0
## 201 0 0
## 202 0 0
## 203 0 0
## 204 0 0
## 205 0 0
## 206 0 0
## 207 0 0
##
## $svmLinear
## inv out pri ver Mean
## 292 100.00000 73.21856 100.00000 67.00007 85.05466
## 288 98.38428 73.89123 98.38428 68.51787 84.79441
## 307 99.30755 77.25457 99.30755 62.66350 84.63329
## 295 100.00000 68.95832 100.00000 64.94020 83.47463
## 286 94.22956 73.44278 94.22956 68.08421 82.49653
## 293 99.30755 68.28565 99.30755 60.92887 81.95740
## 297 97.23019 70.07944 97.23019 61.47094 81.50269
## 287 94.34497 70.30366 94.34497 61.79618 80.19744
## 291 94.46038 65.37076 94.46038 65.69910 79.99765
## 308 89.84402 74.78812 89.84402 61.14569 78.90546
##
## $rf
## Overall Mean
## 288 100.00000 100.00000
## 312 73.97874 73.97874
## 307 73.51876 73.51876
## 309 72.75941 72.75941
## 257 67.14458 67.14458
## 308 40.54604 40.54604
## 293 39.32506 39.32506
## 287 37.12353 37.12353
## 352 36.40903 36.40903
## 310 36.35935 36.35935
FEATURE SELECTION
Using recursive feature elimination, various subset sizes were evaluated with a random forests classifier. The results are shown below:
# Recursive feature elimination (RFE) with random forest functions,
# repeated 10-fold cross-validation (5 repeats), over subset sizes
# 2, 4, 8, 16, 32, 64
feature.selection.result <- feature.selection(uv.prop.ds, "seasons",
                                              method = "rfe",
                                              functions = rfFuncs,
                                              validation = "repeatedcv",
                                              repeats = 5,
                                              subsets = 2^(1:6))
feature.selection.result
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.3599 0.1467 0.1371 0.1814
## 4 0.3718 0.1613 0.1260 0.1677
## 8 0.4295 0.2403 0.1566 0.2071
## 16 0.4532 0.2713 0.1329 0.1754
## 32 0.4696 0.2921 0.1397 0.1865 *
## 64 0.4538 0.2714 0.1286 0.1726
## 501 0.4610 0.2805 0.1303 0.1745
##
## The top 5 variables (out of 32):
## X288, X307, X293, X292, X296
# Accuracy profile across subset sizes ("g" = grid, "o" = points + lines)
plot(feature.selection.result, type=c("g","o"))
Selection by filter was also used, with the results shown below:
# Selection by filter with random forest scoring functions (rfSBF),
# repeated 10-fold cross-validation with 5 repeats
feature.selection.result2 <- feature.selection(uv.prop.ds, "seasons",
                                               method = "filter",
                                               functions = rfSBF,
                                               validation = "repeatedcv",
                                               repeats = 5,
                                               subsets = 2^(1:6))
feature.selection.result2
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance:
##
## Accuracy Kappa AccuracySD KappaSD
## 0.4326 0.244 0.112 0.148
##
## Using the training set, 51 variables were selected:
## X200, X201, X202, X203, X205...
##
## During resampling, the top 5 selected variables (out of a possible 248):
## X283 (100%), X285 (100%), X286 (100%), X287 (100%), X288 (100%)
##
## On average, 51.1 variables were selected (min = 26, max = 144)