R 비정형데이터_기본함수, stringr

Notice

Recent Comments

Link

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

Tags more

Today

Total

관리 메뉴

저장소

R 비정형데이터_기본함수, stringr 본문

교육/빅데이터 청년인재_경희대 R

R 비정형데이터_기본함수, stringr

HB HB 2019. 7. 29. 18:23

### Text-mining basics: built-in helpers

# `letters` and `LETTERS` are the built-in lowercase / uppercase alphabets.
letters[3]
LETTERS[3]
letters
letters[seq_len(12)]
LETTERS[seq(2, 15)]

## Lowercase <-> uppercase conversion
tolower("KOREA")
toupper("korea")

## nchar(): number of characters in each string (bytes only with type = "bytes")
## length(): number of elements in the vector, not the string width
nchar("korea")
length("korea")  # a single string is a length-1 character vector
nchar("한국")
length("한국")
nchar("한국", type = "bytes")  # byte count; depends on how the string is encoded

## substr(): extract part of a string by start/stop position (important!)
substr("BigDataAnalysis", start = 4, stop = 7)
substr("BigDataAnalysis", 4, 7)  # same call with positional arguments

# substr() is vectorised: it takes the first three characters of every element.
country <- c("korea", "japan", "china", "singapore", "russia")
substr(country, start = 1, stop = 3)

## strsplit(): split a sentence into pieces
myword <- "This is the Big Data Analysis"
x <- strsplit(myword, split = " ")
mysen <- strsplit(myword, split = " ")

# strsplit() returns a list, so pick the element with [[ ]] before indexing the
# word; split = "" then breaks that word into single characters.
strsplit(mysen[[1]][6], split = "")

# Three sentences combined into one character vector.
myword1 <- "This is the Big Data Analysis 1"
myword2 <- "This is the Big Data Analysis 2"
myword3 <- "This is the Big Data Analysis 3"
myword <- c(myword1, myword2, myword3)

# strsplit() is vectorised over its input: a single call splits every sentence
# and returns a list with one character vector per sentence. No loop is needed;
# a loop over `myword` would only recompute this same list on every pass.
result <- strsplit(myword, split = " ")
result

# Split on a literal period; "\\." escapes the dot, which otherwise matches
# any character in a regular expression.
myword <- "우리의 소원은 통일입니다. 꿈에도 소원은 통일입니다."
strsplit(myword, split = "\\.")

## paste(): concatenate strings
number <- 1:10
alphabet <- c("a", "b", "c")

# Two vectors are joined element by element (the shorter one is recycled);
# `sep` is the separator placed between the paired values (default: a space).
paste(number, alphabet)
paste(number, alphabet, sep = "")
paste(number, alphabet, sep = "-")

# `collapse` joins the elements of a single vector into one string.
paste(number, collapse = "")
paste(alphabet, collapse = "")

# Re-join the words of each split sentence with single spaces and print them.
# seq_along() yields an empty sequence when `result` is empty, whereas
# 1:length(result) would iterate over c(1, 0) and fail on result[[1]].
for (i in seq_along(result)) {
  print(paste(result[[i]], collapse = " "))
}

## regexpr(): position of the FIRST match in the text
x <- regexpr("입니다", myword)

# substr() can extract the match once its start and length are known.
substr(myword, start = x[1], stop = x[1] + attr(x, "match.length") - 1)

# Without the "- 1" the stop position lies one character past the match,
# so this call returns the match plus the character that follows it.
substr(myword, start = x[1], stop = x[1] + attr(x, "match.length"))

## gregexpr(): positions of ALL matches in the text
myword
x <- gregexpr("입니다", myword)
x[[1]]                         # start positions of the matches in element 1
attr(x[[1]], "match.length")   # length of each of those matches

# With a vector input the result has one list element per input string.
myword <- c("소원은 통일입니다.사랑입니다.", "꿈엔 통일입니다. 그러한 것입니다.")
x <- gregexpr("입니다", myword)
x[[1]]
x[[2]]
attr(x[[1]], "match.length")

## grep(): indices of the elements that match; value = TRUE returns the
## matching elements themselves
myword <- c("우리의 소원은 통일입니다.", "꿈에도 소원은 통일입니다.")
grep("입니다", myword)
grep("우리", myword)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
grep("우리", myword, value = TRUE)  # prints the whole matching sentence

## grepl(): TRUE/FALSE per element (there is no `value` argument)
myword
grepl("입니다", myword)
grepl("꿈", myword)

## sub(): replace the FIRST occurrence of a pattern (handy for fixing typos)
## gsub(): replace EVERY occurrence of a pattern
myword <- c("소원은 통일입니다.사랑입니다.", "꿈엔 통일입니다. 그러한 것입니다.")
sub("입니다", "일까요", myword)
gsub("입니다", "이삼", myword)
# Replacing with " " puts a single space where the pattern was; use "" as the
# replacement to delete the pattern outright.
gsub("통일", " ", myword)

## regmatches(): extract the matched text; used together with
## regexpr() / gregexpr()
myword
mypattern <- gregexpr("입니다", myword)
regmatches(myword, mypattern)

### Regular expressions, used with gregexpr() + regmatches()
# Regex visualiser for reference: http://regexper.com/

# [[:digit:]] matches a single digit.
myword <- c("1번째는 통일입니다.", "2번째는 우정입니다.3번째는 같이")
mypattern <- gregexpr("[[:digit:]]", myword)
regmatches(myword, mypattern)

# Literal text can follow the character class ...
myword <- c("1번째는 통일입니다.", "2번째는 가족입니다.")
mypattern <- gregexpr("[[:digit:]](번째)", myword)
regmatches(myword, mypattern)

# ... or surround it on both sides.
myword <- c("제1번째는 통일입니다.", "제2번째는 가족입니다.")
mypattern <- gregexpr("(제)[[:digit:]](번째)", myword)
regmatches(myword, mypattern)

## Quantifiers: "at least N characters"
myword <- c("1번째는 통일입니다.", "2번째는 가족입니다.")

# [[:alpha:]] matches one alphabetic character; "+" means one or more.
mypattern <- gregexpr("[[:alpha:]]+(니다)", myword)
regmatches(myword, mypattern)

# {n,} means n or more repetitions; {,n} means at most n repetitions.
mypattern <- gregexpr("[[:alpha:]]{1,}(니다)", myword)
regmatches(myword, mypattern)

## Restricting where a match may end
myword <- c("1번째는 통일입니다.", "2번째는 가족입니다.", "3번째는 옵니다만")

# Unrestricted: the pattern may match in the middle of a longer word.
mypattern <- gregexpr("[[:alpha:]]{1,}(니다)", myword)
regmatches(myword, mypattern)

# Restricted: \\b is a word boundary, so only matches that end a word are kept.
mypattern <- gregexpr("[[:alpha:]]{1,}(니다)\\b", myword)
regmatches(myword, mypattern)

## Frequency count: regmatches() returns a list, so flatten it with unlist()
## before passing it to table()
myword <- c("1번째는 통일입니다.", "2번째는 가족입니다.", "3번째도 통일입니다.")
mypattern <- gregexpr("[[:alpha:]]{1,}(니다)", myword)
x <- regmatches(myword, mypattern)
table(unlist(x))

## Filtering by a condition; grep() reports the indices of matching elements
grep_ex <- c("a.txt", "A.txt", "ab.txt", "123.txt", "ba123.txt")

# "^" outside brackets anchors the pattern to the start of the string.
grep("^a", grep_ex)
# value = TRUE returns the matching elements instead of their indices.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
grep("^a", grep_ex, value = TRUE)
# "^" as the first character inside [ ] negates the class, so "^[^0-9]" keeps
# the names whose first character is NOT a digit.
grep("^[^0-9]", grep_ex, value = TRUE)

## Exercise: extract codes of the form "ORA-" + five digits from reg_ex.txt
# NOTE(review): read.table() applies its default quote / comment handling to
# each line; confirm that suits reg_ex.txt (the reference answer in this file
# uses readLines() instead).
reg_ex <- read.table("reg_ex.txt", sep = "\t")
str(reg_ex)
head(reg_ex$V1)

# Keep only the entries that start with "ORA", then pull out the code itself.
reg_1 <- grep("^ORA", reg_ex$V1, value = TRUE)
reg_2 <- gregexpr("(^ORA-)[[:digit:]]{5}", reg_1)
reg_2 <- regmatches(reg_1, reg_2)
reg_2 <- unlist(reg_2)
head(reg_2, 3)

# View() opens a data viewer, which only makes sense in an interactive session.
if (interactive()) {
  View(reg_2)
}

# Number of entries containing "ORA": grep() returns one index per matching
# entry, so its length is the count.
length(grep("ORA", reg_ex$V1))

# Reference answer
txt <- readLines("reg_ex.txt")

# Lines that start with "ORA-" (TRUE spelled out: `T` can be reassigned).
reg_1 <- grep("^ORA-", txt, value = TRUE)
head(reg_1, 3)

# The first nine characters cover "ORA-" plus a five-digit code.
reg_2 <- substr(reg_1, start = 1, stop = 9)
head(reg_2, 3)

# Count every occurrence of "ORA" across all lines.
reg_3 <- gregexpr("ORA", txt)
x <- regmatches(txt, reg_3)
table(unlist(x))

### stringr package functions
# Install stringr only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

myword <- c("1번째는 통일입니다.", "2번째는 가족입니다.")
str_extract_all(myword, "입니다")  # extract every match

## str_extract(text, key): extract the matched text; the _all variant returns
## every match; simplify = TRUE returns a matrix instead of a list
myword <- c("1번째는 통일입니다. 1-2번째는 사랑입니다.", "2번째는 가족입니다")
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
str_extract_all(myword, "입니다", simplify = TRUE)

myword  # same data, now with a regular expression
str_extract_all(myword, "[[:alpha:]]{1,}(니다)", simplify = TRUE)  # matrix
str_extract_all(myword, "[[:alpha:]]{1,}(니다)")                   # list

## str_locate(txt, key): position of the match; similar to regexpr() /
## gregexpr()
myword
str_locate(myword, "입니다")
str_locate_all(myword, "입니다")

## str_detect(): does each string contain the pattern? (no _all variant)
myword
str_detect(myword, "입니다")

## str_replace(txt, key_old, key_new): replace the first match;
## str_replace_all() replaces every match
myword
str_replace(myword, "입니다", "일까요")
str_replace_all(myword, "입니다", "일까요")

## str_split(txt, key): split on the given pattern
myword
str_split(myword, "\\.")  # "\\." is a literal period

## str_count(): how many times the pattern occurs in each string
myword
str_count(myword, "입니다")

## str_sub(txt, start, stop): extract by position (used a lot!); the stringr
## counterpart of base substr()
myword
str_sub(myword, 1, 4)

## str_length(): number of characters in each string
myword
str_length(myword)

## str_c(): join vectors, the stringr counterpart of paste()
##   sep      - separator between the paired elements
##   collapse - additionally merge the results into one single string
number <- 1:3
alphabet <- c("a", "b", "c")
str_c(number, alphabet, sep = "")
str_c(number, alphabet, collapse = "")

'교육 > 빅데이터 청년인재_경희대 R' 카테고리의 다른 글

R 앙상블_시계열, 배깅, 부스팅, 랜덤포레스트 (0)	2019.07.28
R 무방향성 데이터분석_연관규칙 (0)	2019.07.27
R 무방향성 데이터분석_상관분석 (0)	2019.07.26
R 의사결정나무, 인공신경망 (0)	2019.07.26
R 지도학습 알고리즘_로지스틱 (0)	2019.07.25

'교육/빅데이터 청년인재_경희대 R' Related Articles

Comments

저장소

R 비정형데이터_기본함수, stringr 본문

R 비정형데이터_기본함수, stringr

'교육 > 빅데이터 청년인재_경희대 R' 카테고리의 다른 글

티스토리툴바