저장소

R 회귀분석 본문

교육/빅데이터 청년인재_경희대 R

R 회귀분석

HB HB 2019. 7. 25. 10:00

### Regression analysis - the standard technique for estimating a
### continuous outcome variable.

## Covers simple regression, multiple regression, and logistic regression.


## Preprocessing

auto=read.csv('autoparts.csv')

dim(auto)

# Missing-value inspection: element-wise NA map, then row-wise completeness.
is.na(auto)

complete.cases(auto)

# Show only the rows containing at least one NA.
auto[!complete.cases(auto),]

head(auto)

# Keep a single product number and drop the first (product-number) column.
auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

head(auto1)


summary(auto1)

boxplot(auto1)

boxplot(auto1$separation)

# BUG FIX: the outlier filter must be built from auto1 (the data frame being
# indexed), not from the full auto data frame -- they have different row
# counts, so auto$c_thickness produced a misaligned logical index. The
# re-read further down in this script already used auto1$c_thickness.
auto2=auto1[auto1$c_thickness<1000,]

boxplot(auto2)

hist(auto2$c_thickness,breaks = 30)



## Cross-validation


### Modeling: simple linear regression


head(women)

summary(women)  # check for outliers

# Fit weight as a linear function of height and print the model object.
m=lm(weight~height,data=women)
m

summary(m)

# Scatter plot of the raw data with the fitted line overlaid.
plot(women$height,women$weight)
abline(m,col='green')


## Prediction with predict()

new.data=data.frame(height=75)
predict(m,newdata=new.data)

# Several new heights can be predicted in one call.
new.data=data.frame(height=c(75,76))
predict(m,newdata=new.data)


## Prediction with a 95% confidence interval (interval='confidence')
predict(m,newdata=new.data,interval = 'confidence')


head(auto2)

# Simple regression on the autoparts data: thickness explained by fix_time.
m=lm(c_thickness~fix_time,data=auto2)

# Predict the thickness for one new fixation time.
new.data=data.frame(fix_time=86.1)
predict(m,newdata=new.data)



### Multiple linear regression

# Two-predictor model vs. the full model with every remaining column.
m=lm(c_thickness~fix_time+a_speed,data=auto2)

n=lm(c_thickness~.,data=auto2)

summary(m)

summary(n)



## Predicting several new observations

new.data=data.frame(fix_time=86.1,a_speed=0.610,b_speed=1.718,separation=241.9,s_separation=657.3,
                    rate_term=95,mpa=78.2,load_time=18.1,highpressure_time=74)

predict(m,newdata=new.data)


new.data=data.frame(fix_time=c(86.1,86.1),a_speed=c(0.610,0.603),b_speed=c(1.718,1.704),separation=c(241.9,242.5),
                    rate_term=c(95,95),mpa=c(78.2,77.9),load_time=c(18.1,18.2),highpressure_time=c(74,56))

predict(m,newdata=new.data,interval = 'confidence')


# Use held-out rows 32-50 directly as newdata: they already carry every
# predictor under its correct column name.
# BUG FIX: the original rebuilt the frame by hand and referenced
# test$rate_terms while naming the column rate_term elsewhere -- an
# inconsistency that silently produced a NULL column. Passing the subset
# itself avoids the problem entirely (predict ignores extra columns).
test=auto2[32:50,]

predict(m,newdata=test)



### Finding the best predictors with step()

head(swiss)

# Full model: Fertility explained by every other swiss indicator.
m=lm(Fertility~.,data=swiss)

summary(m)


# BUG FIX: forward selection must start from the NULL model and be given a
# scope; calling step() on the full model with direction='forward' has
# nothing left to add and simply returns the full model unchanged.
null.m=lm(Fertility~1,data=swiss)
step(null.m,scope=formula(m),direction = 'forward') # forward selection

step(m,direction = 'backward') # backward elimination

step(m,direction = 'both') # stepwise selection


# Variable selection on the autoparts data.
m=lm(c_thickness~.,data=auto2)

# BUG FIX: forward selection again needs a null starting model plus a scope;
# step(m, direction='forward') from the full model is a no-op.
step(lm(c_thickness~1,data=auto2),scope=formula(m),direction = 'forward')

step(m,direction = 'backward')

step(m,direction = 'both')


## Cross-validation helpers: sample() and split()

# Build the index groups for K-fold CV.

# Shuffle every row index of auto2.
t_index=sample(seq_len(nrow(auto2)),size=nrow(auto2))

# Distribute the shuffled indices over 10 folds.
split_index=split(t_index,1:10)

class(split_index)

length(split_index)

# Two equivalent ways of pulling all indices belonging to fold 1.
split_index$'1'

split_index[[1]]



# Reload and re-filter the data (subsetting auto1 consistently this time).
auto=read.csv('autoparts.csv')

auto1=auto[auto$prod_no=='90784-76001',c(2:11)]

auto2=auto1[auto1$c_thickness<1000,]


# 10-fold cross-validation of the full linear model: for each fold, train
# on the other nine and record the mean squared error on the held-out fold.
mse=numeric(10)

for(i in 1:10){
  test=auto2[split_index[[i]],]
  train=auto2[-split_index[[i]],]
  m=lm(c_thickness~.,data=train)
  m_pred=predict(m,test)
  mse[i]=mean((test$c_thickness-m_pred)^2)
}

mse

mean(mse) # average MSE over the 10 folds



# Repeat the experiment with 5 folds instead of 10.
split_index=split(t_index,1:5)

mse=numeric(5)

for(i in 1:5){
  fold=split_index[[i]]
  train=auto2[-fold,]
  test=auto2[fold,]
  m=lm(c_thickness~.,data=train)
  mse[i]=mean((test$c_thickness-predict(m,test))^2)
}

mse

mean(mse)


## A simpler performance check: one 70/30 train/test split

t_index=sample(1:nrow(auto2),size=nrow(auto2)*0.7)

train=auto2[t_index,]
test=auto2[-t_index,]

# Sanity-check the split sizes.
nrow(train);nrow(test)


# Fit on the training rows, then score MSE on the held-out rows.
m=lm(c_thickness~.,data=train)
m_pred=predict(m,test)
mean((test$c_thickness-m_pred)^2)



## LASSO: automatic variable selection among the predictors

# glmnet needs a numeric predictor matrix plus a response vector;
# columns 1:9 are the predictors, c_thickness is the response.
# (data.frame[i] selects columns.)
xmat=as.matrix(auto2[1:9])

yvec=auto2$c_thickness


# BUG FIX: install.packages() was called AFTER library(glmnet), so a clean
# machine failed before the package could ever be installed. Install on
# demand first, then attach.
if(!requireNamespace('glmnet',quietly=TRUE)){
  install.packages('glmnet')
}
library(glmnet)


# alpha = 1 selects the lasso penalty (alpha = 0 would be ridge);
# nlambda is how many penalty values to try along the path.
fit.lasso=glmnet(x=xmat,y=yvec,alpha=1,nlambda = 100)

# 10-fold cross-validation over the same lambda path (nfolds = folds).
fit.lasso.cv=cv.glmnet(x=xmat,y=yvec,nfolds=10,alpha=1,lambda = fit.lasso$lambda)

plot(fit.lasso.cv)


# Two common penalty choices:
fit.lasso.param=fit.lasso.cv$lambda.min # lambda with the minimal CV error

fit.lasso.param=fit.lasso.cv$lambda.1se # largest lambda within 1 SE of the minimum


# Refit at the chosen penalty and inspect which coefficients survive.
fit.lasso.tune=glmnet(x=xmat,y=yvec,alpha = 1,lambda=fit.lasso.param)

coef(fit.lasso.tune)



# Practice exercise: lasso on the room-occupancy data.

train=read.csv('occupancy_train.csv')

head(train)

# Predictors are columns 2:6; Occupancy is the binary (0/1) response.
xmat=as.matrix(train[2:6])

occ=train$Occupancy

# FIX: with a binary response, fit a *logistic* lasso via family='binomial'
# instead of the default gaussian (linear) fit.
fit.lasso=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,nlambda = 50)

fit.lasso.cv=cv.glmnet(x=xmat,y=occ,family='binomial',nfolds=10,alpha=1,lambda=fit.lasso$lambda)

plot(fit.lasso.cv)


# Coefficients at the CV-optimal penalty...
fit.lasso.param=fit.lasso.cv$lambda.min

fit.lasso.tune=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,lambda = fit.lasso.param)

coef(fit.lasso.tune)



# ...and at the more conservative 1-SE penalty.
fit.lasso.param1=fit.lasso.cv$lambda.1se

fit.lasso.tune1=glmnet(x=xmat,y=occ,family='binomial',alpha = 1,lambda = fit.lasso.param1)

coef(fit.lasso.tune1)

 


Comments