# Work from the Dropbox root so the relative dataset paths below resolve.
# NOTE(review): setwd() in a script is fragile -- consider project-relative
# paths if this analysis is ever reused outside this machine.
setwd("~/Dropbox")
library(metabolomicsUM)
source("Datasets/Propolis/UVV/scripts/propolis_uv_metadata.R")

# Locations of the raw UV-Vis spectra and the metadata file derived from them.
uv.prop.metadata.file <- "Datasets/Propolis/UVV/metadata/prop_uv_metadata.csv"
uv.prop.data.file <- "Datasets/Propolis/UVV/data/uvv.csv"

# Extract the sample metadata from the data file and write it out as CSV.
get.metadata.uv(uv.prop.data.file, write.file = TRUE,
                file.name = uv.prop.metadata.file)

# Axis labels for the spectral data (checked against the source file).
label.x <- "wavelength(nm)"
label.val <- "absorbance"

# Assemble the dataset object from the data and metadata CSVs.
uv.prop.ds <- read.dataset.csv(uv.prop.data.file, uv.prop.metadata.file,
                               description = "UV data for propolis",
                               type = "uvv-spectra", label.x = label.x,
                               label.values = label.val)

# Collection year is a categorical variable, not a number.
uv.prop.ds$metadata$years <- factor(uv.prop.ds$metadata$years)
sum.dataset(uv.prop.ds)
## Dataset summary:
## Valid dataset
## Description: UV data for propolis
## Type of data: uvv-spectra
## Number of samples: 136
## Number of data points 501
## Number of metadata variables: 3
## Label of x-axis values: wavelength(nm)
## Label of data points: absorbance
## Number of missing values in data: 0
## Mean of data values: 71.93233
## Median of data values: 0.3097
## Standard deviation: 18529.24
## Range of values: 1e-04 4836665
## Quantiles:
## 0% 25% 50% 75% 100%
## 1.000000e-04 1.035000e-01 3.097000e-01 1.009725e+00 4.836665e+06
The seasons metadata variable is used; no normalization is applied, while smoothing, background, offset and baseline corrections are.
PREPROCESSING
By plotting the entire spectrum we can see an outlier between wavelengths 500 and 600 nm. The offending value is set to missing (NA) and is filled in later by the smoothing interpolation step.
# Drop the samples that carry no season annotation, then mask the outlier.
samples.without.season <- which(is.na(uv.prop.ds$metadata$seasons))
uv.prop.ds <- remove.samples(uv.prop.ds, samples.without.season)
plot.spectra(uv.prop.ds, "seasons")

# Outlier: sample ACpri11 at wavelength 529 is set to NA; the smoothing
# interpolation performed later will fill in the gap.
uv.prop.ds <- replace.data.value(uv.prop.ds, 529, "ACpri11", NA)
plot.spectra(uv.prop.ds, "seasons")
sum.dataset(uv.prop.ds)
## Dataset summary:
## Valid dataset
## Description: UV data for propolis
## Type of data: uvv-spectra
## Number of samples: 133
## Number of data points 501
## Number of metadata variables: 3
## Label of x-axis values: wavelength(nm)
## Label of data points: absorbance
## Number of missing values in data: 1
## Mean of data values: 0.9414224
## Median of data values: 0.3098
## Standard deviation: 1.293135
## Range of values: 1e-04 4.4118
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.0001 0.1046 0.3098 0.9885 4.4118
Smoothing interpolation (loess) is applied, followed by background, offset and baseline corrections:
# Resample the spectra onto a 4 nm grid with loess smoothing, then apply
# the three spectral corrections in sequence.
uv.prop.wavelens <- get.x.values.as.num(uv.prop.ds)
x.axis.sm <- seq(min(uv.prop.wavelens), max(uv.prop.wavelens), by = 4)
uv.prop.smooth <- smoothing.interpolation(uv.prop.ds, method = "loess",
                                          x.axis = x.axis.sm)
uv.prop.bg <- data.correction(uv.prop.smooth, "background")
uv.prop.offset <- data.correction(uv.prop.bg, "offset")
uv.prop.baseline <- data.correction(uv.prop.offset, "baseline")
sum.dataset(uv.prop.baseline)
## Dataset summary:
## Valid dataset
## Description: UV data for propolis-smoothed with hyperSpec spc.loess; background correction; offset correction; baseline correction
## Type of data: undefined
## Number of samples: 133
## Number of data points 126
## Number of metadata variables: 3
## Label of x-axis values: wavelength(nm)
## Label of data points: absorbance
## Number of missing values in data: 0
## Mean of data values: 0.3979923
## Median of data values: 0.1782429
## Standard deviation: 0.6027572
## Range of values: -0.01268384 3.83529
## Quantiles:
## 0% 25% 50% 75% 100%
## -0.01268384 0.05968338 0.17824286 0.41410057 3.83529021
plot.spectra(uv.prop.baseline, "seasons")
UNIVARIATE TESTS
An analysis of variance (ANOVA) was conducted on the data, together with Tukey's post-hoc test; the top 10 results ordered by p-value are shown below:
# One-way ANOVA (with Tukey post-hoc) of every wavelength against season.
uv.prop.anova <- aov.all.vars(uv.prop.baseline, "seasons")
# Top 10 wavelengths, ordered by p-value.
uv.prop.anova[1:10, ]
## pvalues logs fdr tukey
## 292 6.801742e-08 7.167380 8.570194e-06 out-inv; pri-inv; ver-out; ver-pri
## 296 1.537456e-07 6.813197 9.685976e-06 out-inv; pri-inv; ver-inv; ver-pri
## 288 3.457460e-06 5.461243 1.452133e-04 out-inv; pri-inv
## 300 2.610246e-05 4.583319 8.222274e-04 out-inv; pri-inv
## 308 1.055068e-04 3.976719 2.658772e-03 out-inv; pri-inv
## 216 2.368373e-04 3.625550 4.973582e-03 out-inv; pri-out; ver-out
## 304 3.461896e-04 3.460686 5.789306e-03 out-inv; pri-inv
## 212 3.675750e-04 3.434654 5.789306e-03 out-inv; ver-out
## 208 6.869430e-04 3.163079 9.617202e-03 out-inv; ver-out
## 204 1.010090e-03 2.995640 1.272714e-02 out-inv; ver-out
A heatmap with the correlations between all the variables is shown below:
# Pearson correlations between all pairs of variables, drawn as a heatmap.
uv.prop.correl <- correlations.dataset(uv.prop.baseline, method = "pearson")
heatmap(uv.prop.correl, col = topo.colors(256))
CLUSTERING
Hierarchical clustering with euclidean distance and complete method was performed on the data and the resulting dendrogram is shown below:
# Hierarchical clustering on euclidean distances; dendrogram coloured by season.
uv.prop.hc <- clustering(uv.prop.baseline, method = "hc",
                         distance = "euclidean")
dendrogram.plot.col(uv.prop.baseline, uv.prop.hc, "seasons")
K-Means was also performed on the data, with 3 centers; the results are shown below, together with a plot giving, for each cluster, the median of its samples in blue and the values of all samples in that cluster in grey:
# k-means with 3 centres; plot per-cluster medians and list cluster members.
uv.prop.kmeans <- clustering(uv.prop.baseline, method = "kmeans",
                             num.clusters = 3)
kmeans.plot(uv.prop.baseline, uv.prop.kmeans)
kmeans.result.df(uv.prop.kmeans, 3)
## cluster
## 1 1
## 2 2
## 3 3
## samples
## 1 DCout10 FPout10 ITout10 PUout10 SAout10 SJCout10 XXout10 BGinv10 CEinv10 CNinv10 CNOinv10 FPinv10 ITinv10 JBinv10 SAinv10 SJCinv10 BGpri10 BR1pri10 DCpri10 FPpri10 ITpri10 JBpri10 SApri10 VRpri10 ACver10 DCver10 FPver10 SAver10 SJ2ver10 URver10 VRver10 ACout11 ANout11 BGout11 CNout11 DCout11 FPout11 PUout11 SAout11 SJCout11 VRout11 ACinv11 ANinv11 BGinv11 DCinv11 FPinv11 SAinv11 CNpri11 DCpri11 FPpri11 JBpri11 PUpri11 SJCpri11 BR2ver11 JBver11 SAver11 SJ2ver11 SJCver11 VRver11
## 2 ANout10 BGout10 BR1out10 JBout10 SJ1out10 SJ2out10 URout10 VRout10 ACinv10 ANinv10 BR2inv10 PUinv10 URinv10 XXinv10 ANpri10 SJ1pri10 SJ2pri10 URpri10 BGver10 BR1ver10 BR2ver10 JBver10 CEout11 SJ2out11 XXout11 BR2inv11 PUinv11 BGpri11 BR2pri11 BR1pri11 ITpri11 SJ1pri11 URpri11 ANver11 SJ2inv11 XXinv11 BGver11 FPver11 SJ1ver11 URver11
## 3 ACout10 BR2out10 CEout10 CNout10 CNOout10 DCinv10 ACpri10 BR2pri10 CEpri10 CNpri10 PUpri10 SJCpri10 XXpri10 ANver10 CEver10 CNver10 ITver10 PUver10 SJCver10 CNOout11 ITout11 BR1inv11 CEinv11 CNinv11 CNOinv11 ACpri11 CEpri11 VRpri11 SJCinv11 BR1ver11 CEver11 CNver11 ITver11 PUver11
PCA
Principal components analysis was performed on the data and some plots are shown below:
# PCA on scaled, centred data; TRUE/FALSE spelled out (T/F are reassignable
# aliases and should not be relied upon).
uv.prop.pca <- pca.analysis.dataset(uv.prop.baseline, scale = TRUE,
                                    center = TRUE)
# Pairs plot, scree plot, 2D scores plot with confidence ellipses, and a
# combined PCA/k-means (4 clusters) view, all coloured by season.
pca.pairs.plot(uv.prop.baseline, uv.prop.pca, "seasons")
pca.screeplot(uv.prop.pca)
pca.scoresplot2D(uv.prop.baseline, uv.prop.pca, "seasons", ellipses = TRUE)
pca.kmeans.plot2D(uv.prop.baseline, uv.prop.pca, ellipses = TRUE,
                  num.clusters = 4, labels = TRUE)
MACHINE LEARNING
For classification models and prediction the following parameters were used: - models: PLS, J48, JRip, SVM and Random Forests - validation method: repeated cross-validation - number of folds: 10 - number of repeats: 10
Below are some results with the best tune for each model:
# Train five classifiers on the season labels with repeated 10-fold CV
# (10 repeats) and report the performance of each model's best tune.
classifier.names <- c("pls", "J48", "JRip", "svmLinear", "rf")
uv.prop.ml <- train.models.performance(uv.prop.baseline, classifier.names,
                                       "seasons", "repeatedcv",
                                       metric = "ROC")
uv.prop.ml$performance
## Accuracy Kappa Sensitivity Specificity ROC
## pls 0.4261310 0.23606358 0.4310417 0.809077 0.7178400
## J48 0.4290348 0.23907557 0.4339583 0.809673 0.6286851
## JRip 0.3041520 0.07501787 0.3075000 0.768851 0.5563085
## svmLinear 0.3908040 0.18735540 0.3954167 0.796524 0.6898253
## rf 0.4479803 0.26500627 0.4529167 0.816154 0.7334987
## AccuracySD KappaSD SensitivitySD SpecificitySD ROCSD
## pls 0.13685650 0.1826340 0.13764261 0.04587005 0.09961535
## J48 0.13367945 0.1778895 0.13413764 0.04488273 0.09680019
## JRip 0.09728985 0.1277377 0.09799119 0.03218647 0.08693625
## svmLinear 0.11986582 0.1590576 0.12046268 0.03988013 0.08561050
## rf 0.13767307 0.1816980 0.13558637 0.04575612 0.09179013
And the variable importance in the four season classes for all models:
# Top-10 variable importance per season class for every trained model.
# The row labels are wavelengths (nm) on the resampled 4 nm grid.
summary.var.importance(uv.prop.ml, 10)
## $pls
## inv out pri ver Mean
## 292 100.00000 37.92341 59.096953 16.329493 53.33746
## 296 89.70545 20.45311 49.126753 20.039638 44.83124
## 200 43.05608 81.60970 24.524007 28.211584 44.35034
## 204 39.00974 75.75032 21.064560 21.890087 39.42868
## 208 38.00974 72.42821 18.930618 21.016452 37.59626
## 336 40.22022 38.43801 6.625690 59.065199 36.08728
## 288 73.91810 20.00711 40.649629 6.741464 35.32908
## 348 38.56027 33.95970 7.962277 59.752964 35.05880
## 284 43.28275 49.31446 22.050529 25.423847 35.01790
## 212 31.87208 73.78001 18.430948 15.029046 34.77802
##
## $J48
## inv out pri ver Mean
## 296 100.00000 77.21006 100.00000 59.56356 84.19340
## 292 96.12917 77.71142 96.12917 58.34367 82.07836
## 288 93.80668 81.47165 93.80668 56.88917 81.49355
## 308 91.48418 71.94573 91.48418 63.81756 79.68292
## 216 83.47711 83.88731 60.30377 83.88731 77.88888
## 300 92.77446 65.42800 92.77446 60.09531 77.76806
## 212 85.98393 85.98393 56.87532 82.00340 77.71164
## 208 89.99484 89.99484 49.56131 73.99677 75.88694
## 220 79.96756 79.96756 59.16095 73.29030 73.09659
## 304 87.87141 51.89117 87.87141 53.71432 70.33708
##
## $JRip
## Overall Mean
## 308 100 100
## 336 100 100
## 200 0 0
## 204 0 0
## 208 0 0
## 212 0 0
## 216 0 0
## 220 0 0
## 224 0 0
## 228 0 0
##
## $svmLinear
## inv out pri ver Mean
## 296 100.00000 77.21006 100.00000 59.56356 84.19340
## 292 96.12917 77.71142 96.12917 58.34367 82.07836
## 288 93.80668 81.47165 93.80668 56.88917 81.49355
## 308 91.48418 71.94573 91.48418 63.81756 79.68292
## 216 83.47711 83.88731 60.30377 83.88731 77.88888
## 300 92.77446 65.42800 92.77446 60.09531 77.76806
## 212 85.98393 85.98393 56.87532 82.00340 77.71164
## 208 89.99484 89.99484 49.56131 73.99677 75.88694
## 220 79.96756 79.96756 59.16095 73.29030 73.09659
## 304 87.87141 51.89117 87.87141 53.71432 70.33708
##
## $rf
## Overall Mean
## 292 100.00000 100.00000
## 296 91.51684 91.51684
## 212 69.90052 69.90052
## 300 68.09065 68.09065
## 288 55.52845 55.52845
## 308 51.35390 51.35390
## 304 47.99090 47.99090
## 324 45.96293 45.96293
## 312 45.17458 45.17458
## 336 44.07122 44.07122
FEATURE SELECTION
Using recursive feature elimination, various subset sizes were evaluated with a random forests classifier. The results are shown below:
# Recursive feature elimination (random forest functions), repeated 10-fold
# CV with 5 repeats, over subset sizes 2, 4, 8, ..., 64 (plus the full set).
rfe.subset.sizes <- 2^(1:6)
feature.selection.result <- feature.selection(uv.prop.baseline, "seasons",
                                              method = "rfe",
                                              functions = rfFuncs,
                                              validation = "repeatedcv",
                                              repeats = 5,
                                              subsets = rfe.subset.sizes)
feature.selection.result
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.3774 0.1687 0.1260 0.1682
## 4 0.3786 0.1704 0.1194 0.1577
## 8 0.3948 0.1939 0.1025 0.1376
## 16 0.4003 0.2005 0.1284 0.1720
## 32 0.4169 0.2233 0.1123 0.1510
## 64 0.4118 0.2152 0.1258 0.1700
## 126 0.4329 0.2442 0.1435 0.1923 *
##
## The top 5 variables (out of 126):
## X292, X288, X296, X300, X212
plot(feature.selection.result, type = c("g", "o"))
Selection by filter was also used, with the results shown below:
# Selection by filter with the same random-forest functions and the same
# repeated cross-validation scheme.
feature.selection.result2 <- feature.selection(uv.prop.baseline, "seasons",
                                               method = "filter",
                                               functions = rfSBF,
                                               validation = "repeatedcv",
                                               repeats = 5,
                                               subsets = 2^(1:6))
feature.selection.result2
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance:
##
## Accuracy Kappa AccuracySD KappaSD
## 0.4315 0.2424 0.1466 0.1942
##
## Using the training set, 33 variables were selected:
## X200, X204, X208, X212, X216...
##
## During resampling, the top 5 selected variables (out of a possible 53):
## X200 (100%), X204 (100%), X208 (100%), X212 (100%), X216 (100%)
##
## On average, 30.2 variables were selected (min = 21, max = 40)