저장소

R 회귀분석 본문

교육/빅데이터 청년인재_경희대 R

R 회귀분석

HB HB 2019. 7. 25. 10:00

### Regression analysis - the standard technique for estimating a
### continuous outcome variable.

## Covers simple regression, multiple regression, and logistic regression.


## Preprocessing

auto=read.csv('autoparts.csv')

dim(auto)

# Missing-value inspection: element-wise NA map, then row-wise completeness.
is.na(auto)

complete.cases(auto)

# Show only the rows containing at least one NA.
auto[!complete.cases(auto),]

head(auto)

# Keep a single product number and drop the first (product-number) column.
auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

head(auto1)


summary(auto1)

boxplot(auto1)

boxplot(auto1$separation)

# BUG FIX: the outlier filter must be built from auto1 (the data frame being
# indexed), not from the full auto data frame -- they have different row
# counts, so auto$c_thickness produced a misaligned logical index. The
# re-read further down in this script already used auto1$c_thickness.
auto2=auto1[auto1$c_thickness<1000,]

boxplot(auto2)

hist(auto2$c_thickness,breaks = 30)



## Cross-validation


### Modeling: simple linear regression


head(women)

summary(women)  # check for outliers

# Fit weight as a linear function of height and print the model object.
m=lm(weight~height,data=women)
m

summary(m)

# Scatter plot of the raw data with the fitted line overlaid.
plot(women$height,women$weight)
abline(m,col='green')


## Prediction with predict()

new.data=data.frame(height=75)
predict(m,newdata=new.data)

# Several new heights can be predicted in one call.
new.data=data.frame(height=c(75,76))
predict(m,newdata=new.data)


## Prediction with a 95% confidence interval (interval='confidence')
predict(m,newdata=new.data,interval = 'confidence')


head(auto2)

# Simple regression on the autoparts data: thickness explained by fix_time.
m=lm(c_thickness~fix_time,data=auto2)

# Predict the thickness for one new fixation time.
new.data=data.frame(fix_time=86.1)
predict(m,newdata=new.data)



### Multiple linear regression

# Two-predictor model vs. the full model with every remaining column.
m=lm(c_thickness~fix_time+a_speed,data=auto2)

n=lm(c_thickness~.,data=auto2)

summary(m)

summary(n)



## Predicting several new observations

new.data=data.frame(fix_time=86.1,a_speed=0.610,b_speed=1.718,separation=241.9,s_separation=657.3,
                    rate_term=95,mpa=78.2,load_time=18.1,highpressure_time=74)

predict(m,newdata=new.data)


new.data=data.frame(fix_time=c(86.1,86.1),a_speed=c(0.610,0.603),b_speed=c(1.718,1.704),separation=c(241.9,242.5),
                    rate_term=c(95,95),mpa=c(78.2,77.9),load_time=c(18.1,18.2),highpressure_time=c(74,56))

predict(m,newdata=new.data,interval = 'confidence')


# Use held-out rows 32-50 directly as newdata: they already carry every
# predictor under its correct column name.
# BUG FIX: the original rebuilt the frame by hand and referenced
# test$rate_terms while naming the column rate_term elsewhere -- an
# inconsistency that silently produced a NULL column. Passing the subset
# itself avoids the problem entirely (predict ignores extra columns).
test=auto2[32:50,]

predict(m,newdata=test)



### Finding the best predictors with step()

head(swiss)

# Full model: Fertility explained by every other swiss indicator.
m=lm(Fertility~.,data=swiss)

summary(m)


# BUG FIX: forward selection must start from the NULL model and be given a
# scope; calling step() on the full model with direction='forward' has
# nothing left to add and simply returns the full model unchanged.
null.m=lm(Fertility~1,data=swiss)
step(null.m,scope=formula(m),direction = 'forward') # forward selection

step(m,direction = 'backward') # backward elimination

step(m,direction = 'both') # stepwise selection


# Variable selection on the autoparts data.
m=lm(c_thickness~.,data=auto2)

# BUG FIX: forward selection again needs a null starting model plus a scope;
# step(m, direction='forward') from the full model is a no-op.
step(lm(c_thickness~1,data=auto2),scope=formula(m),direction = 'forward')

step(m,direction = 'backward')

step(m,direction = 'both')


## Cross-validation helpers: sample() and split()

# Build the index groups for K-fold CV.

# Shuffle every row index of auto2.
t_index=sample(seq_len(nrow(auto2)),size=nrow(auto2))

# Distribute the shuffled indices over 10 folds.
split_index=split(t_index,1:10)

class(split_index)

length(split_index)

# Two equivalent ways of pulling all indices belonging to fold 1.
split_index$'1'

split_index[[1]]



# Reload and re-filter the data (subsetting auto1 consistently this time).
auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]


# 10-fold cross-validation of the full linear model: for each fold, train
# on the other nine and record the mean squared error on the held-out fold.
mse=numeric(10)

for(i in 1:10){
  test=auto2[split_index[[i]],]
  train=auto2[-split_index[[i]],]
  m=lm(c_thickness~.,data=train)
  m_pred=predict(m,test)
  mse[i]=mean((test$c_thickness-m_pred)^2)
}

mse

mean(mse) # average MSE over the 10 folds



# Repeat the experiment with 5 folds instead of 10.
split_index=split(t_index,1:5)

mse=numeric(5)

for(i in 1:5){
  fold=split_index[[i]]
  train=auto2[-fold,]
  test=auto2[fold,]
  m=lm(c_thickness~.,data=train)
  mse[i]=mean((test$c_thickness-predict(m,test))^2)
}

mse

mean(mse)


## A simpler performance check: one 70/30 train/test split

t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]
test=auto2[-t_index,]

# Sanity-check the split sizes.
nrow(train);nrow(test)


# Fit on the training rows, then score MSE on the held-out rows.
m=lm(c_thickness~.,data=train)
m_pred=predict(m,test)
mean((test$c_thickness-m_pred)^2)



## LASSO: automatic variable selection among the predictors

# glmnet needs a numeric predictor matrix plus a response vector;
# columns 1:9 are the predictors, c_thickness is the response.
# (data.frame[i] selects columns.)
xmat=as.matrix(auto2[1:9])

yvec=auto2$c_thickness


# BUG FIX: install.packages() was called AFTER library(glmnet), so a clean
# machine failed before the package could ever be installed. Install on
# demand first, then attach.
if(!requireNamespace('glmnet',quietly=TRUE)){
  install.packages('glmnet')
}
library(glmnet)


# alpha = 1 selects the lasso penalty (alpha = 0 would be ridge);
# nlambda is how many penalty values to try along the path.
fit.lasso=glmnet(x=xmat,y=yvec,alpha=1,nlambda = 100)

# 10-fold cross-validation over the same lambda path (nfolds = folds).
fit.lasso.cv=cv.glmnet(x=xmat,y=yvec,nfolds=10,alpha=1,lambda = fit.lasso$lambda)

plot(fit.lasso.cv)


# Two common penalty choices:
fit.lasso.param=fit.lasso.cv$lambda.min # lambda with the minimal CV error

fit.lasso.param=fit.lasso.cv$lambda.1se # largest lambda within 1 SE of the minimum


# Refit at the chosen penalty and inspect which coefficients survive.
fit.lasso.tune=glmnet(x=xmat,y=yvec,alpha = 1,lambda=fit.lasso.param)

coef(fit.lasso.tune)



# Practice exercise: lasso on the room-occupancy data.

train=read.csv('occupancy_train.csv')

head(train)

# Predictors are columns 2:6; Occupancy is the binary (0/1) response.
xmat=as.matrix(train[2:6])

occ=train$Occupancy

# FIX: with a binary response, fit a *logistic* lasso via family='binomial'
# instead of the default gaussian (linear) fit.
fit.lasso=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,nlambda = 50)

fit.lasso.cv=cv.glmnet(x=xmat,y=occ,family='binomial',nfolds=10,alpha=1,lambda=fit.lasso$lambda)

plot(fit.lasso.cv)


# Coefficients at the CV-optimal penalty...
fit.lasso.param=fit.lasso.cv$lambda.min

fit.lasso.tune=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,lambda = fit.lasso.param)

coef(fit.lasso.tune)



# ...and at the more conservative 1-SE penalty.
fit.lasso.param1=fit.lasso.cv$lambda.1se

fit.lasso.tune1=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,lambda = fit.lasso.param1)

coef(fit.lasso.tune1)

 


Comments