저장소

R 의사결정나무, 인공신경망 본문

교육/빅데이터 청년인재_경희대 R

R 의사결정나무, 인공신경망

HB HB 2019. 7. 26. 09:30

### Decision tree

auto=read.csv('autoparts.csv')

# Keep a single product and drop the prod_no column (columns 2:11)
auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

# Remove extreme thickness outliers
auto2=auto1[auto1$c_thickness<1000,]

# Faulty (1) when thickness is outside [20, 32]; faulty is the positive class
auto2$y_falty=ifelse((auto2$c_thickness<20)|(auto2$c_thickness>32),1,0)

# 70/30 train/test split.
# BUG FIX: size was nrow(auto1)*0.7 — sized by the pre-filter data frame,
# which can request more rows than auto2 has; size by auto2 itself.
t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]
test=auto2[-t_index,]

nrow(train);nrow(test)

head(train)

install.packages('tree')
library(tree)

# Classification tree: predict faulty/normal from the nine process variables
m=tree(factor(y_falty)~fix_time+a_speed+b_speed+separation+s_separation+
         rate_terms+mpa+load_time+highpressure_time,data=train)

plot(m)
text(m)


#Pruning


# Cost-complexity pruning sequence scored by misclassification error
prune.m=prune.tree(m,method='misclass') #misclass = misclassification rate

plot(prune.m)  # error vs. tree size, to help pick how far to prune


prune.m9=prune.tree(m,best=9) #prune back to 9 terminal nodes

plot(prune.m9)

text(prune.m9)


prune.m3=prune.tree(m,best=3) #prune back to 3 terminal nodes

plot(prune.m3)

text(prune.m3)


#Validate classification accuracy with a cross (confusion) table

# Full tree: predicted classes for the test set
yhat_test=predict(m,test,type='class') #classification, so type='class'

table=table(real=test$y_falty,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table) #accuracy (correct-classification rate)


# Same evaluation for the 3-node pruned tree
yhat_test=predict(prune.m3,test,type='class') #classification, so type='class'

table=table(real=test$y_falty,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table)


# Same evaluation for the 9-node pruned tree
yhat_test=predict(prune.m9,test,type='class') #classification, so type='class'

table=table(real=test$y_falty,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table)


#ROC, AUC - always check the AUC value after drawing the ROC curve
# NOTE(review): yhat_test here still holds the prune.m9 predictions (a
# factor); confirm Epi::ROC handles a factor marker as intended.

library(Epi)

ROC(test=yhat_test,stat=test$y_falty,plot='ROC',AUC=T,main='TREE')




#Prediction with the fitted tree

# Single new observation: all nine process variables must be supplied
new.data=data.frame(fix_time=87,a_speed=0.609,b_speed=1.715,separation=242.7,
                    s_separation=657.5,rate_terms=95,mpa=78,load_time=18.1,highpressure_time=82)

predict(m,newdata = new.data,type='class')

# Two new observations at once (each column is a length-2 vector)
new.data=data.frame(fix_time=c(87,85.6),a_speed=c(0.609,0.472),b_speed=c(1.715,1.685),
                    separation=c(242.7,243.4),s_separation=c(657.5,657.9),rate_terms=c(97,95),
                    mpa=c(78,28.8),load_time=c(18.1,18.2),highpressure_time=c(82,60))

predict(m,newdata = new.data,type='class')

# Whole test set, rebuilt column by column
new.data=data.frame(fix_time=test$fix_time,a_speed=test$a_speed,b_speed=test$b_speed,
                    separation=test$separation,s_separation=test$s_separation,
                    rate_terms=test$rate_terms,mpa=test$mpa,load_time=test$load_time,
                    highpressure_time=test$highpressure_time)

# BUG FIX: the argument was misspelled 'tyle=', which predict() silently
# swallowed via '...', so type defaulted to "vector" and the call returned
# class probabilities instead of class labels.
predict(m,newdata=new.data,type='class')




#Multinomial response (2 = normal; 1 and 3 = faulty: too thin / too thick)

auto2$gclass=as.factor(ifelse(auto2$c_thickness<20,1,ifelse(auto2$c_thickness<32,2,3)))

# BUG FIX: size was 1:nrow(auto2)*0.7 — a vector, not a sample size.
t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]

test=auto2[-t_index,]


############################## previously broken section, fixed below

m<-tree(gclass~fix_time+a_speed+b_speed+separation+s_separation+
         rate_terms+mpa+load_time+highpressure_time,data=train)

yhat_test=predict(m,test,type='class')

# BUG FIX: the table() call was missing its closing parenthesis and read
# test$g_class, but the column created above is gclass.
table=table(real=test$gclass,predict=yhat_test);table

# Accuracy over all three classes (sum of the diagonal, not just 2 cells)
sum(diag(table))/sum(table)



# Regression tree on the continuous thickness (no factor() on the response)
m=tree(c_thickness~fix_time+a_speed+b_speed+separation+s_separation+
         rate_terms+mpa+load_time+highpressure_time,data=train)

plot(m)

text(m)


########################################################


#mse: mean of the squared (predicted - actual) errors
# BUG FIX: yhat_test previously still held the classification predictions
# from the earlier block; compute numeric predictions from this regression
# tree before the MSE (no type='class' for regression).
yhat_test=predict(m,test)

mse=mean((yhat_test-test$c_thickness)^2);mse



#-------------------------------


###K-nearest neighbor classification (K-NN)


auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]

# Faulty (1) when thickness is outside [20, 32]
auto2$y_faulty=ifelse((auto2$c_thickness<20)|(auto2$c_thickness>32),1,0)


# 70/30 train/test split
t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]

test=auto2[-t_index,]


# knn() needs plain numeric matrices, not data frames
xmat.train=as.matrix(train[1:9]) #columns 1-9 (the predictors) as a matrix

y_faulty.train=train$y_faulty

xmat.test=as.matrix(test[1:9])

head(xmat.test)


install.packages('class')

library(class)


# Fit-and-predict in one call; class labels must be a factor
yhat_test=knn(xmat.train,xmat.test,as.factor(y_faulty.train),k=3)

yhat_test #predicted classes for the test set


table=table(real=test$y_faulty,predict=yhat_test)

(table[1,1]+table[2,2])/sum(table) #accuracy

 ##knn is itself the prediction function (no separate model object)


##Find the optimal k

library(e1071)

# BUG FIX: k=3 evaluates a single value and tunes nothing; search a range
# of candidate k values (same as the later exercise at the bottom).
tune.out=tune.knn(x=xmat.train,y=as.factor(y_faulty.train),k=1:10)

tune.out

plot(tune.out)


# Refit with the chosen k; labels passed as a factor for consistency
yhat_test=knn(xmat.train,xmat.test,as.factor(y_faulty.train),k=5)

table=table(real=test$y_faulty,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table) #accuracy


library(Epi)

ROC(test=yhat_test,stat=test$y_faulty,plot="ROC",AUC=T,main='KNN')


# BUG FIX: the new observation had column typos (a_spped, highpressure_tiem),
# load_time was missing entirely, and mpa held load_time's value (18.1).
# knn() matches columns by position, so all nine features must be present
# in the same order as the training matrix.
new.data=data.frame(fix_time=87,a_speed=0.609,b_speed=1.715,separation=242.7,
                    s_separation=657.5,rate_terms=95,mpa=78,load_time=18.1,
                    highpressure_time=82)

knn(xmat.train,as.matrix(new.data),y_faulty.train,k=5)



auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]

# BUG FIX: was ifelse(auto2$c_thickness<20),1,...)) — the closing
# parenthesis was misplaced, splitting the condition out of the ifelse.
auto2$g_class=as.factor(ifelse(auto2$c_thickness<20,1,ifelse(auto2$c_thickness<32,2,3)))



# Rebuild the data with a 3-level class label (2 = normal, 1/3 = faulty)
auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]

auto2$g_class=as.factor(ifelse(auto2$c_thickness<20,1,ifelse(auto2$c_thickness<32,2,3)))


# 70/30 train/test split
t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]

test=auto2[-t_index,]


#Generate predictions

xmat.train=as.matrix(train[1:9])

g_class.train=train$g_class

xmat.test=as.matrix(test[1:9])


library(class)

yhat_test=knn(xmat.train,xmat.test,g_class.train,k=3)

table=table(real=test$g_class,predict=yhat_test)

# Accuracy over all three classes (sum of the diagonal cells)
(table[1,1]+table[2,2]+table[3,3])/sum(table)


#KNN regression also needs the predictors as a numeric matrix

##KNN regression via package 'FNN'::knn.reg

# BUG FIX: the matrix was assigned to x.mat.train (with a dot) but used as
# xmat.train below; that only worked because a stale xmat.train from the
# previous block happened to exist. Use one consistent name.
xmat.train=as.matrix(train[1:9])

c_thickness.train=train$c_thickness

xmat.test=as.matrix(test[1:9])


install.packages('FNN')

library(FNN)


# knn.reg returns a list; the predictions are in $pred
yhat_test=knn.reg(xmat.train,xmat.test,c_thickness.train,k=3)

mse=mean((yhat_test$pred-test$c_thickness)^2);mse



###Artificial neural network 

auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]

# 3-level class label: 2 = normal, 1 and 3 = faulty
auto2$g_class=as.factor(ifelse(auto2$c_thickness<20,1,ifelse(auto2$c_thickness<32,2,3)))


# 70/30 train/test split
t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]

test=auto2[-t_index,]


install.packages('nnet')

library(nnet)


# Single-hidden-layer network with 10 hidden units
m=nnet(g_class ~ fix_time+a_speed+b_speed+separation+s_separation+rate_terms+mpa+
         load_time+highpressure_time,data=train,size=10)


#Performance evaluation via cross (confusion) table

yhat_test=predict(m,test,type='class') #classification, so type='class'

table=table(real=test$g_class,predict=yhat_test)

table

# Accuracy over all three classes
(table[1,1]+table[2,2]+table[3,3])/sum(table)


# New observations; the formula interface matches columns by name,
# so the order (mpa listed last here) does not matter
new.data=data.frame(fix_time=c(87,85.6),a_speed=c(0.609,0.472),b_speed=c(1.715,1.685),separation=c(242.7,243.4),
                    s_separation=c(657.5,657.9),rate_terms=c(95,95),
                    load_time=c(18.1,18.2),highpressure_time=c(82,60),mpa=c(78,28.8))

predict(m,newdata=new.data,type='class')




##Exercise (note: this block fits a lasso with glmnet; the decision-tree
##exercise follows in the next block)

traintxt=read.csv('occupancy_train.csv')

library(glmnet)

xmat=as.matrix(traintxt[2:6]) #predictors (all but the response) as a matrix; df[] selects columns

# BUG FIX: $Occupanc only worked through $'s partial matching; spell the
# column name out in full.
yvec=traintxt$Occupancy


fit.lasso=glmnet(x=xmat,y=yvec,alpha=1,nlambda = 100) #alpha=1 -> lasso penalty

fit.lasso.cv=cv.glmnet(x=xmat,y=yvec,nfolds=10,alpha=1,lambda = fit.lasso$lambda) #10-fold CV

plot(fit.lasso.cv)


fit.lasso.param=fit.lasso.cv$lambda.min #lambda with minimum CV error

fit.lasso.param=fit.lasso.cv$lambda.1se #1-SE-rule lambda (note: overwrites the line above)


fit.lasso.tune=glmnet(x=xmat,y=yvec,alpha = 1,lambda=fit.lasso.param)

coef(fit.lasso.tune) #coefficients at the tuned lambda



#Classification tree (occupancy exercise)

# 70/30 train/test split
t_index=sample(1:nrow(traintxt),size=nrow(traintxt)*0.7)

train=traintxt[t_index,]

test=traintxt[-t_index,]

m=tree(factor(Occupancy)~Temperature+Light+CO2,data=train)

plot(m)

text(m)


#Predictions and accuracy

yhat_test=predict(m,test,type='class')

table=table(real=test$Occupancy,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table) #accuracy


#ROC AUC

library(Epi)

ROC(test=yhat_test,stat=test$Occupancy,plot='ROC',AUC=T,main='tree')



##KNN exercise

occall=read.csv('occupancy_all.csv',header=T)

occtrain=read.csv('occupancy_train.csv',header=T)

occtest=read.csv('occupancy_test.csv',header=T)

# BUG FIX: size=nrow(occall*0.7 multiplied the data frame by 0.7 and left
# sample() unclosed; also the split indexed occtrain with occall-sized
# indices (producing NA rows) — index the same data frame that was sampled.
t_index=sample(1:nrow(occall),size=nrow(occall)*0.7)

train=occall[t_index,]

test=occall[-t_index,]


head(train)

head(test)

# Features used: columns 2, 4, 5 (Temperature, Light, CO2 by position)
xmat.train=as.matrix(occtrain[c(2,4,5)])

occ.train=occtrain$Occupancy

xmat.test=as.matrix(occtest[c(2,4,5)])

library(class)


head(xmat.train)

head(xmat.test)

yhat_test=knn(xmat.train,xmat.test,as.factor(occ.train),k=3)

table=table(real=occtest$Occupancy,predict=yhat_test);table

(table[1,1]+table[2,2])/sum(table) #accuracy


#Tune k over a range of candidates

library(e1071)

tune.out=tune.knn(x=xmat.train,y=as.factor(occ.train),k=1:10);tune.out

plot(tune.out)


#ROC / AUC
# BUG FIX: the plot title said 'tree' although this is the KNN model.
ROC(test=yhat_test,stat=occtest$Occupancy,plot='ROC',AUC=T,main='KNN')


 



Comments