22 maja 2015
"Field of study that gives computers the ability to learn without being explicitly programmed"
Arthur Samuel, 1959
Obecnie uczenie maszynowe zajmuje się głównie rozwiązywaniem praktycznych problemów związanych z wieloma różnymi aspektami, od rozpoznawania mowy czy pisma, poprzez klasyfikacje obrazów, rekomendacje produktów, do diagnoz medycznych
| data | stacja | poziom | gdd | tmax | tmin | tmean | precip | precip_type | |
|---|---|---|---|---|---|---|---|---|---|
| 135 | 1996-05-14 | Poznań | wysokie | 263.65 | 16.450 | 7.775 | 11.675 | 2.2250 | O |
| 136 | 1996-05-15 | Poznań | wysokie | 276.15 | 20.250 | 8.950 | 14.000 | 1.4500 | O |
| 137 | 1996-05-16 | Poznań | niskie | 286.25 | 21.975 | 9.675 | 15.175 | 0.0000 | O |
| 138 | 1996-05-17 | Poznań | niskie | 294.55 | 21.650 | 9.925 | 14.950 | 0.0000 | O |
| 139 | 1996-05-18 | Poznań | niskie | 304.95 | 20.475 | 10.175 | 14.475 | 0.0025 | W |
library('rpart')
library('kernlab')
rpart1 <- rpart(poziom ~ gdd+tmax+tmin+tmean+precip+precip_type, data=dane)
input <- as.matrix(dane[c('gdd', 'tmax', 'tmin', 'tmean', 'precip')])
output <- dane$poziom
ksvm1 <- ksvm(x=input, y=output, prob.model=TRUE)
W niektórych pakietach możliwy do użycia jest tylko jeden z rodzajów interfejsu.
Podobnie, poszczególne pakiety wymagają różnych parametrów, np. przy wyborze prawdopodobieństwa zajścia klasy jako typu predykcji. Może to być 'prob', 'probabilities', 'prosterior', 'raw', 'response'
predict(rpart1, type='prob') predict(ksvm1, input, type='probabilities')
install.packages('caret') #albo
devtools::install_github('topepo/caret/pkg/caret')
library('caret')
set.seed(75) sample_size <- floor(0.75 * nrow(dane)) train_index1 <- sample(seq_len(nrow(dane)), size = sample_size) training1 <- dane[c(train_index1), ] testing1 <- dane[c(-train_index1), ] table(training1$poziom)
## ## niskie wysokie ## 1849 323
table(testing1$poziom)
## ## niskie wysokie ## 638 86
library('caret')
set.seed(22)
train_index2 <- createDataPartition(dane$poziom, p=0.75, list=FALSE)
training2 <- dane[c(train_index2), ]
testing2 <- dane[c(-train_index2), ]
table(training2$poziom)
## ## niskie wysokie ## 1866 307
table(testing2$poziom)
## ## niskie wysokie ## 621 102
createDataPartition(y, times = 1, p = 0.5, list = TRUE, groups = min(5, length(y))) createResample(y, times = 10, list = TRUE) createFolds(y, k = 10, list = TRUE, returnTrain = FALSE) createMultiFolds(y, k = 10, times = 5) createTimeSlices(y, initialWindow, horizon = 1, fixedWindow = TRUE, skip = 0)
train(x, y,
method = "rf",
preProcess = NULL,
...,
weights = NULL,
metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
maximize = ifelse(metric == "RMSE", FALSE, TRUE),
trControl = trainControl(),
tuneGrid = NULL,
tuneLength = 3)
train(x=training2[4:9], y=training2$poziom)
albo
train(poziom~gdd+tmax+tmin+tmean+precip+precip_type, data=training2)
mtryGrid <- expand.grid(mtry = c(1, 3, 5, 7)) model_rf_tuned <- train(x=input, y=output, method="rf", tuneGrid=mtryGrid) model_rf_tuned
## Random Forest ## ## 2173 samples ## 6 predictors ## 2 classes: 'niskie', 'wysokie' ## ## No pre-processing ## Resampling: Bootstrapped (25 reps) ## ## Summary of sample sizes: 2173, 2173, 2173, 2173, 2173, 2173, ... ## ## Resampling results across tuning parameters: ## ## mtry Accuracy Kappa Accuracy SD Kappa SD ## 1 0.9371586 0.7251532 0.007670486 0.02762596 ## 3 0.9451672 0.7731765 0.006674176 0.02670697 ## 5 0.9432904 0.7654370 0.006734001 0.02688989 ## 7 0.9419338 0.7599992 0.007329568 0.02819884 ## ## Accuracy was used to select the optimal model using the largest value. ## The final value used for the model was mtry = 3.
trainControl(method = "boot",
number = ifelse(grepl("cv", method), 10, 25),
repeats = ifelse(grepl("cv", method), 1, number),
p = 0.75,
initialWindow = NULL,
horizon = 1,
fixedWindow = TRUE,
verboseIter = FALSE,
returnData = TRUE,
returnResamp = "final",
savePredictions = FALSE,
classProbs = FALSE,
summaryFunction = defaultSummary,
selectionFunction = "best",
preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5),
index = NULL,
indexOut = NULL,
timingSamps = 0,
predictionBounds = rep(FALSE, 2),
seeds = NA,
adaptive = list(min = 5, alpha = 0.05,
method = "gls", complete = TRUE),
trim = FALSE,
allowParallel = TRUE)
ctrl1 <- trainControl(method = "none") model1 <- train(x=input, y=output, trControl = ctrl1, tuneGrid=data.frame(mtry=2)) model1
## Random Forest ## ## 3732 samples ## 6 predictors ## 2 classes: 'niskie', 'wysokie' ## ## No pre-processing ## Resampling: None
ctrl2 <- trainControl(method = "repeatedcv", number=10, repeats=5, classProbs=TRUE) model2 <- train(x=input, y=output, trControl = ctrl2) model2
## Random Forest ## ## 3732 samples ## 6 predictors ## 2 classes: 'niskie', 'wysokie' ## ## No pre-processing ## Resampling: Cross-Validated (10 fold, repeated 5 times) ## ## Summary of sample sizes: 3359, 3359, 3359, 3359, 3358, 3360, ... ## ## Resampling results across tuning parameters: ## ## mtry Accuracy Kappa Accuracy SD Kappa SD ## 2 0.9776520 0.9553048 0.008606891 0.01721103 ## 4 0.9776519 0.9553042 0.008470490 0.01693926 ## 6 0.9779193 0.9558389 0.008775563 0.01754886 ## ## Accuracy was used to select the optimal model using the largest value. ## The final value used for the model was mtry = 6.
Dostępne metody to 'boot', 'boot632', 'cv', 'repeatedcv', 'LOOCV', 'LGOCV', 'none', 'oob', 'adaptive_cv', 'adaptive_boot' or 'adaptive_LGOCV'
'boot':
Wybór optymalnego modelu opiera się głównie na podstawie trzech argumentów:
1. metric z funkcji train 2. maximize z funkcji train 3. selectionFunction z funkcji trainControl
metric to argument określający parametr na podstawie którego wybierany będzie optymalny model:
- dla modeli regresyjnych jest to domyślnie 'RMSE' - dla modeli klasyfikacyjnych jest to domyślnie 'Accuracy' - domyślnie można określić inne parametry, tj. "Rsquared" dla modeli regresyjnych oraz "Kappa" dla modeli klasyfikacyjnych - możliwe jest także zdefiniowanie innych parametrów
Argument maximize ustala czy wartość parametru powinna być jak największa (TRUE) czy jak najmniejsza (FALSE)
model_kappa <- train(x=input, y=output, metric='Kappa') model_kappa
## Random Forest ## ## 3732 samples ## 6 predictors ## 2 classes: 'niskie', 'wysokie' ## ## No pre-processing ## Resampling: Bootstrapped (25 reps) ## ## Summary of sample sizes: 3732, 3732, 3732, 3732, 3732, 3732, ... ## ## Resampling results across tuning parameters: ## ## mtry Accuracy Kappa Accuracy SD Kappa SD ## 2 0.9737878 0.9475519 0.004977758 0.009955975 ## 4 0.9738723 0.9477241 0.005159715 0.010306441 ## 6 0.9730624 0.9461044 0.004610004 0.009199731 ## ## Kappa was used to select the optimal model using the largest value. ## The final value used for the model was mtry = 4.
selectionFunction pozwala na wybranie jednej z trzech opcji:
- best - domyślna opcja - wybiera parametr, który ma najmniejszą/największą wartość - oneSE - "one standard error" - wybierany jest najprostszy model model w zasięgu jednego błędu standardowego od modelu o największej wartości wybranego parametru - tolerance - wybierany jest najprostszy model model w zasięgu wybranego procenta tolerancji od modelu o największej wartości wybranego parametru
?caret::best
ctrl_kfold1 <- trainControl(
method = 'repeatedcv',
number = 10,
repeats = 100,
savePredictions = TRUE,
classProbs= TRUE,
selectionFunction = "tolerance")
ctrl_kfold2 <- trainControl(
method = 'repeatedcv',
number = 10,
repeats = 100,
savePredictions = TRUE,
classProbs= TRUE,
selectionFunction = "oneSE")
Może ona działać na dwa sposoby:
1. funkcja preProcess szacuje odpowiednie parametry a następnie na podstawie tych parametrów wykonywana jest predykcja przetworzonej zmiennej 2. argumenty funkcji preProcess są podawane w funkcji train
?caret::preProcess
pre_proc <- preProcess(training2[c(4:8)], method = c("center", "scale"))
pre_proc_train <- predict(pre_proc, training2[c(4:8)])
pre_proc_test <- predict(pre_proc, testing2[c(4:8)])
model_pre_proc <- train(poziom~gdd+tmax+tmin+tmean+precip+precip_type, data=training2, preProcess = c("center", "scale"))
model_pre_proc
## Random Forest ## ## 2173 samples ## 9 predictors ## 2 classes: 'niskie', 'wysokie' ## ## Pre-processing: centered, scaled ## Resampling: Bootstrapped (25 reps) ## ## Summary of sample sizes: 2173, 2173, 2173, 2173, 2173, 2173, ... ## ## Resampling results across tuning parameters: ## ## mtry Accuracy Kappa Accuracy SD Kappa SD ## 2 0.9452616 0.7700730 0.007230072 0.02771344 ## 4 0.9459239 0.7738053 0.006404059 0.02416959 ## 7 0.9435259 0.7643025 0.008142519 0.03100224 ## ## Accuracy was used to select the optimal model using the largest value. ## The final value used for the model was mtry = 4.
summary(training3[c(4:6)])
## gdd tmax tmin ## Min. : 0.00 Min. :-11.600 Min. :-21.475 ## 1st Qu.: 17.49 1st Qu.: 3.688 1st Qu.: -2.025 ## Median : 103.58 Median : 11.438 Median : 2.575 ## Mean : 241.15 Mean : 11.542 Mean : 2.607 ## 3rd Qu.: 396.94 3rd Qu.: 19.400 3rd Qu.: 8.050 ## Max. :2499.40 Max. : 31.500 Max. : 18.150 ## NA's :27 NA's :23 NA's :21
inpute <- preProcess(training3[c(4:6)], method = 'medianImpute') inpute_train <- predict(inpute, training3[c(4:6)])
summary(inpute_train)
## gdd tmax tmin ## Min. : 0.00 Min. :-11.60 Min. :-21.475 ## 1st Qu.: 18.75 1st Qu.: 3.80 1st Qu.: -2.000 ## Median : 103.58 Median : 11.44 Median : 2.575 ## Mean : 239.44 Mean : 11.54 Mean : 2.607 ## 3rd Qu.: 392.20 3rd Qu.: 19.32 3rd Qu.: 7.975 ## Max. :2499.40 Max. : 31.50 Max. : 18.150
dummy_object <- dummyVars(~., data = dane[4:9], levelsOnly = TRUE) dane_dv <- predict(dummy_object, dane)
head(dane_dv)
## gdd tmax tmin tmean precip O S W ## 5 0 -3.675 -8.475 -6.025 0.3525 0 1 0 ## 6 0 -2.425 -6.825 -4.675 0.3750 0 1 0 ## 7 0 -1.700 -5.600 -3.875 0.3750 1 0 0 ## 8 0 -1.275 -5.125 -3.450 0.3025 0 0 1 ## 9 0 -0.750 -4.450 -2.350 0.6775 0 0 1 ## 10 0 0.525 -2.875 -1.000 0.6525 1 0 0
cor_matrix <- cor(training2[c(4:8)]) high_cor_index <- findCorrelation(cor_matrix, cutoff = .98) input_no_high_cor <- training2[c(4:8)][, -high_cor_index] input_no_high_cor <- cor(input_no_high_cor) summary(input_no_high_cor)
## gdd tmax tmin precip ## Min. :0.1036 Min. :0.1059 Min. :0.2236 Min. :0.1036 ## 1st Qu.:0.5333 1st Qu.:0.5354 1st Qu.:0.5633 1st Qu.:0.1054 ## Median :0.6775 Median :0.8110 Median :0.8101 Median :0.1648 ## Mean :0.6147 Mean :0.6820 Mean :0.7109 Mean :0.3583 ## 3rd Qu.:0.7589 3rd Qu.:0.9577 3rd Qu.:0.9577 3rd Qu.:0.4177 ## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
summary(training2$poziom)
## niskie wysokie ## 1866 307
dane_upsampled <- upSample(x = training2[c(4:9)], y=training2$poziom, yname = 'poziom') summary(dane_upsampled$poziom)
## niskie wysokie ## 1866 1866
dane_downsampled <- downSample(x = training2[c(4:9)], y=training2$poziom, yname = 'poziom') summary(dane_downsampled$poziom)
## niskie wysokie ## 307 307
ctrl <- trainControl(method = "boot",
classProbs = TRUE,
summaryFunction = twoClassSummary)
suma_wys <- sum(output == "wysokie")
set.seed(2)
rfDownsampled <- train(poziom~tmax+tmin+tmean+precip+precip_type, data=training2,
method = "rf",
ntree = 1500,
tuneLength = 5,
metric = "ROC",
trControl = ctrl,
strata = training2$poziom,
sampsize = rep(suma_wys, 2))
library('doMC') # niedostępne pod Windowsem
registerDoMC(cores = 3)
model <- train(x=input, y=output, method = "rf")
library('microbenchmark')
model_fun1 <- function() {
model <- train(x=input, y=output, method = "rf")
}
microbenchmark(model_fun1(), times=10L)
## Unit: seconds ## expr min lq mean median uq max neval ## model_fun1() 133.8864 134.8692 135.3923 135.4181 136.2773 136.6912 10
model_fun2 <- function() {
library('doMC')
registerDoMC(cores = 12)
model <- train(x=input, y=output, method = "rf")
}
microbenchmark(model_fun2(), times=10L)
## Unit: seconds ## expr min lq mean median uq max neval ## model_fun2() 17.21115 17.49581 17.79191 17.77224 18.0512 18.45868 10
library('doParallel')
set.seed(123)
seeds <- vector(mode = "list", length = 11) #length is = (n_repeats*nresampling)+1
for(i in 1:10) seeds[[i]]<- sample.int(n=1000, 3)
seeds[[11]]<-sample.int(1000, 1)
ctrl <- trainControl(method='cv', seeds=seeds, index=createFolds(output))
cl <- makeCluster(12)
registerDoParallel(cl)
model1 <- train(x=input, y=output, method='rf', trControl=ctrl)
model2 <- train(x=input, y=output, method='rf', trControl=ctrl)
stopCluster(cl)
all.equal(predict(model1, type='prob'), predict(model2, type='prob'))
## [1] TRUE
model2
## Random Forest ## ## 2173 samples ## 6 predictors ## 2 classes: 'niskie', 'wysokie' ## ## No pre-processing ## Resampling: Cross-Validated (10 fold) ## ## Summary of sample sizes: 217, 216, 218, 218, 217, 217, ... ## ## Resampling results across tuning parameters: ## ## mtry Accuracy Kappa Accuracy SD Kappa SD ## 2 0.9269318 0.6780849 0.006737710 0.03912803 ## 4 0.9318908 0.7090616 0.005540656 0.02270425 ## 6 0.9302549 0.7063494 0.006774623 0.02374984 ## ## Accuracy was used to select the optimal model using the largest value. ## The final value used for the model was mtry = 4.
plot(model2)
model2$finalModel
## ## Call: ## randomForest(x = x, y = y, mtry = param$mtry) ## Type of random forest: classification ## Number of trees: 500 ## No. of variables tried at each split: 4 ## ## OOB estimate of error rate: 5.11% ## Confusion matrix: ## niskie wysokie class.error ## niskie 1814 52 0.0278671 ## wysokie 59 248 0.1921824
rpart_model <- train(x=input, y=output, method='rpart') plot(rpart_model$finalModel) text(rpart_model$finalModel)
confusionMatrix(model2)
## Cross-Validated (10 fold) Confusion Matrix ## ## (entries are percentages of table totals) ## ## Reference ## Prediction niskie wysokie ## niskie 83.0 4.0 ## wysokie 2.8 10.1
varImp(model2)
albo
plot(varImp(model2))
predykcja <- predict(model2, testing2[c(4:9)]) confusionMatrix(testing2$poziom, predykcja)
## Confusion Matrix and Statistics ## ## Reference ## Prediction niskie wysokie ## niskie 607 14 ## wysokie 16 86 ## ## Accuracy : 0.9585 ## 95% CI : (0.9413, 0.9718) ## No Information Rate : 0.8617 ## P-Value [Acc > NIR] : <2e-16 ## ## Kappa : 0.8274 ## Mcnemar's Test P-Value : 0.8551 ## ## Sensitivity : 0.9743 ## Specificity : 0.8600 ## Pos Pred Value : 0.9775 ## Neg Pred Value : 0.8431 ## Prevalence : 0.8617 ## Detection Rate : 0.8396 ## Detection Prevalence : 0.8589 ## Balanced Accuracy : 0.9172 ## ## 'Positive' Class : niskie ##
library('pROC')
predykcja <- predict(model2, testing2[c(4:9)], type="prob")
roc <- roc(testing2$poziom, predykcja$wysokie)
plot(roc, print.thres="best", print.thres.best.method="closest.topleft")
## ## Call: ## roc.default(response = testing2$poziom, predictor = predykcja$wysokie) ## ## Data: predykcja$wysokie in 621 controls (testing2$poziom niskie) < 102 cases (testing2$poziom wysokie). ## Area under the curve: 0.9825
model_rpart <- train(x=input, y=output, method = 'rpart') model_rf <- train(x=input, y=output, method = 'rf') model_svm <- train(x=input, y=output, method = 'svmLinear')
resamps <- resamples(list(RPART = model_rpart,
RF = model_rf,
SVM = model_svm))
resamps
## ## Call: ## resamples.default(x = list(RPART = model_rpart, RF = model_rf, SVM ## = model_svm)) ## ## Models: RPART, RF, SVM ## Number of resamples: 25 ## Performance metrics: Accuracy, Kappa ## Time estimates for: everything, final model fit
summary(resamps)
## ## Call: ## summary.resamples(object = resamps) ## ## Models: RPART, RF, SVM ## Number of resamples: 25 ## ## Accuracy ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## RPART 0.8966 0.9088 0.9156 0.9154 0.9209 0.9318 0 ## RF 0.9669 0.9720 0.9740 0.9746 0.9779 0.9815 0 ## SVM 0.7822 0.7967 0.8027 0.8003 0.8057 0.8131 0 ## ## Kappa ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## RPART 0.7932 0.8177 0.8310 0.8308 0.8420 0.8637 0 ## RF 0.9338 0.9439 0.9481 0.9493 0.9557 0.9630 0 ## SVM 0.5636 0.5952 0.6069 0.6009 0.6114 0.6263 0
bwplot(resamps, layout = c(3, 1))
dotplot(resamps, metric = "Accuracy")
splom(resamps)
Niestandardowe modele - http://topepo.github.io/caret/custom_models.html
Resampling adaptacyjny (ang. Adaptive resampling) - http://arxiv.org/abs/1405.6974
Wybór zmiennych (ang. Feature selection) - http://topepo.github.io/caret/featureselection.html
Modele zespolone (ang. Ensemble models) - https://github.com/zachmayer/caretEnsemble
Zdjęcia: