저장소

R 앙상블_시계열, 배깅, 부스팅, 랜덤포레스트 본문

교육/빅데이터 청년인재_경희대 R

R 앙상블_시계열, 배깅, 부스팅, 랜덤포레스트

HB HB 2019. 7. 28. 10:30

앙상블_시계열, 배깅(bagging), 부스팅(boosting), 랜덤포레스트(randomForest)

###시계열 예측 이해와 활용


# Inspect the structure of the built-in AirPassengers dataset.
str(AirPassengers)

# Print the raw monthly passenger counts.
AirPassengers

# Plot the raw series.
plot(AirPassengers)

# Rebuild the series as a ts object with monthly frequency
# (frequency = 12 means twelve observations per cycle/year).
apts <- ts(AirPassengers, frequency = 12)

# Decompose into trend, seasonal and random components.
f <- decompose(apts)

# Visualize the decomposition.
plot(f)


install.packages('tseries')

install.packages("forecast")


library(tseries)

library(forecast)


# STL decomposition: split the series into seasonal, trend and remainder
# components (same information as decompose() above, different method).
plot(stl(AirPassengers, s.window = 'periodic'))

# Both plots above show the same decomposition, produced by two different
# methods.

# Augmented Dickey-Fuller test for stationarity on the differenced,
# log-transformed series (diff() removes trend, log() stabilizes variance).
adf.test(diff(log(AirPassengers)), alternative = 'stationary', k = 0)

# Same test on the raw series for comparison.
# (Removed the stray trailing "c/" that made this line a syntax error.)
adf.test(AirPassengers, alternative = 'stationary', k = 0)



# Automatic ARIMA order selection (p, d, q):
# p = AR order, d = degree of differencing, q = MA order.
auto.arima(AirPassengers)

# Fit a seasonal ARIMA(2,1,1)(0,1,0)[12] model.
fit <- arima(AirPassengers, order = c(2, 1, 1), list(order = c(0, 1, 0), period = 12))

# Forecast 24 months ahead.
fore <- predict(fit, n.ahead = 24)

# Approximate 95% bounds: prediction +/- 2 standard errors.
U <- fore$pred + 2 * fore$se
L <- fore$pred - 2 * fore$se

# Plot actuals, forecast and confidence bounds together.
ts.plot(AirPassengers, fore$pred, U, L, col = c(1, 2, 4, 4), lty = c(1, 1, 2, 2))

# Legend: one lty per legend entry (the original passed 4 lty values for
# 3 labels).
legend('topleft', c('Actual', 'Forecast', 'Error Bounds (95% Confidence)'),
       col = c(1, 2, 4), lty = c(1, 1, 2))

# Automatic ARIMA on the differenced log series.
# (Removed the dangling "alternative=" argument, which was a syntax error;
# "alternative" is an adf.test() argument, not an auto.arima() one.)
auto.arima(diff(log(AirPassengers)))



# Monthly deaths from lung diseases in the UK (built-in dataset).
ldeaths

# Plot the raw series.
plot(ldeaths)

# Decompose into trend, seasonal and random components.
ldeaths.decompose <- decompose(ldeaths)

# Inspect the estimated seasonal component.
ldeaths.decompose$seasonal

# Plot every component of the decomposition.
plot(ldeaths.decompose)

# Seasonally adjust the series by subtracting the seasonal component.
ldeaths.decompose.adj <- ldeaths - ldeaths.decompose$seasonal

# Plot the seasonally adjusted series.
plot(ldeaths.decompose.adj)


# Annual flow of the river Nile (built-in dataset).
Nile

plot(Nile)

# First-order differencing to remove trend.
Nile.diff1 <- diff(Nile, differences = 1)

plot(Nile.diff1)

# Second-order differencing.
Nile.diff2 <- diff(Nile, differences = 2)

plot(Nile.diff2)

# Autocorrelation function up to lag 20 (suggests the MA order q).
acf(Nile.diff2, lag.max = 20)

# Numeric output only (use TRUE/FALSE rather than the reassignable T/F).
acf(Nile.diff2, lag.max = 20, plot = FALSE)

# Partial autocorrelation function (suggests the AR order p).
pacf(Nile.diff2, lag.max = 20)

# Fixed typo: the argument is lag.max, not "lagmax" — the misspelled name
# was silently swallowed by "...", so the lag limit was never applied.
pacf(Nile.diff2, lag.max = 20, plot = FALSE)

# Automatic ARIMA order selection.
auto.arima(Nile)

# Fit ARIMA(1,1,1).
Nile.arima <- arima(Nile, order = c(1, 1, 1))

Nile.arima

# Forecast 10 steps ahead.
Nile.forecast <- forecast(Nile.arima, h = 10)

Nile.forecast

plot(Nile.forecast)


# Daily air quality measurements for New York (built-in dataset).
airquality

# Extract the ozone series; use <- for assignment, not = (idiom).
# NOTE(review): this column contains NAs, which propagate through diff().
ozon <- airquality$Ozone

plot(ozon)

# First-order differencing.
ozon.diff <- diff(ozon, differences = 1)

plot(ozon.diff)

# Automatic ARIMA order selection on the differenced series.
auto.arima(ozon.diff)

# Fit AR(3) on the differenced series.
ozon.arima <- arima(ozon.diff, order = c(3, 0, 0))

ozon.arima

# Forecast 10 steps ahead.
ozon.fore <- forecast(ozon.arima, h = 10)

plot(ozon.fore)


###앙상블 모형



##bagging

install.packages('adabag')

library(adabag)

# Load the iris data.
data(iris)

# Bagging ensemble of classification trees for Species.
# mfinal = number of bootstrap iterations / trees (default 100).
iris.bagging <- bagging(Species ~ ., data = iris, mfinal = 10)

# Relative importance of each predictor.
iris.bagging$importance

# Draw the 10th tree built by the bagging procedure.
plot(iris.bagging$trees[[10]])
text(iris.bagging$trees[[10]])

# Predict on the training data and cross-tabulate predicted vs. true labels.
pred <- predict(iris.bagging, newdata = iris)
table(pred$class, iris[, 5])


##부스팅

# AdaBoost ensemble for Species.
# boos = TRUE: draw a weighted bootstrap sample at each iteration
# (spelled out TRUE instead of the reassignable T);
# mfinal = number of boosting iterations.
boo.adabag <- boosting(Species ~ ., data = iris, boos = TRUE, mfinal = 10)

# Variable importance.
boo.adabag$importance

# Draw the 8th boosted tree.
plot(boo.adabag$tree[[8]])
text(boo.adabag$tree[[8]])

# Predict on the training data.
pred <- predict(boo.adabag, newdata = iris)

# Confusion table: predicted class vs. true species.
tb <- table(pred$class, iris[, 5])
tb

# Misclassification rate = 1 - accuracy (off-diagonal share of the table).
1 - (sum(diag(tb)) / sum(tb))


##nnet함수로 분석

install.packages('ada')

library(ada)

# Keep only versicolor/virginica for a two-class problem.
iris <- iris[iris$Species != "setosa", ]

# Number of remaining rows (100 after dropping setosa).
n <- dim(iris)[1]

# 60/40 train/test split. The original hard-coded 100 even though n was
# already computed; use n so the split tracks the actual row count.
trind <- sample(seq_len(n), floor(.6 * n), FALSE)
teind <- setdiff(seq_len(n), trind)  # set difference: remaining rows form the test set

# Recode the factor to drop the unused "setosa" level, leaving two levels.
iris[, 5] <- as.factor((levels(iris[, 5])[2:3])[as.numeric(iris[, 5]) - 1])

# Discrete AdaBoost: iter = number of boosting iterations,
# nu = shrinkage parameter for boosting.
gdis <- ada(Species ~ ., data = iris[trind, ], iter = 20, nu = 1, type = 'discrete')

# Attach the held-out test set so error curves include test error.
gdis <- addtest(gdis, iris[teind, -5], iris[teind, 5])

gdis

# Training and test error as a function of iteration.
plot(gdis, TRUE, TRUE)

# Pairwise plots of up to 4 most important predictors on the training set.
pairs(gdis, iris[trind, -5], maxvar = 4)



##랜덤포레스트

install.packages('randomForest')

library(randomForest)

# stagec lives in the rpart package (a recommended package shipped with R);
# load it explicitly so this section does not rely on earlier code having
# attached rpart.
data(stagec, package = "rpart")

# Drop rows with missing values (use <- for assignment, not =).
stagec <- na.omit(stagec)

# 70/30 train/test split.
t_index <- sample(seq_len(nrow(stagec)), size = nrow(stagec) * 0.7)
train <- stagec[t_index, ]
test <- stagec[-t_index, ]

# Random forest of 100 trees predicting ploidy;
# proximity = TRUE stores the case-proximity matrix.
rf <- randomForest(ploidy ~ ., data = train, ntree = 100, proximity = TRUE)

# Out-of-bag predictions cross-tabulated against the training labels.
table(predict(rf), train$ploidy)

print(rf)

# Error rate versus number of trees.
plot(rf)

# Variable importance.
importance(rf)
varImpPlot(rf)

# Predict on the held-out test set.
rf.pred <- predict(rf, newdata = test)
table(rf.pred, test$ploidy)

# Margin of each training case (conceptually similar to ROC separation).
plot(margin(rf))

library(e1071)

library(Epi)

ROC(test=rf.pred,stat=test$ploidy,plot="ROC",AUC=T)

install.packages('party')

library(party)


# Random forests are also available via party::cforest()
# (conditional inference trees).

set.seed(1234)  # fix the RNG so the forest is reproducible

cf <- cforest(ploidy ~ ., data = train)

# OOB = TRUE spelled out instead of the reassignable T;
# type = 'response' returns predicted class labels.
cf.pred <- predict(cf, newdata = test, OOB = TRUE, type = 'response')

 


Comments