基于R中交替字符的分割string

我试图找出一个有效的方式去分裂像一个string

"111110000011110000111000" 

成vector

 [1] "11111" "00000" "1111" "0000" "111" "000" 

其中“0”和“1”可以是任何交替字符。

尝试

 strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000" 

更新

使用stri_extract_all_regex修改@ rawr的解决scheme

 library(stringi) stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000" stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" #[10] "000" stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]] #[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" #[8] "00000" "222" "aaa" "bb" "cc" "d" "11" #[15] "D" "aa" "BB" 

基准

 library(stringi) set.seed(24) x3 <- stri_rand_strings(1, 1e4) akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]] #modified @thelatemail's function to make it bit more general thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, perl=TRUE))[[1]] rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]] ananda <- function() unlist(read.fwf(textConnection(x3), rle(strsplit(x3, "")[[1]])$lengths, colClasses = "character")) Colonel <- function() with(rle(strsplit(x3,'')[[1]]), mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values)) Cryo <- function(){ res_vector=rep(NA_character_,nchar(x3)) res_vector[1]=substr(x3,1,1) counter=1 old_tmp='' for (i in 2:nchar(x3)) { tmp=substr(x3,i,i) if (tmp==old_tmp) { res_vector[counter]=paste0(res_vector[counter],tmp) } else { res_vector[counter+1]=tmp counter=counter+1 } old_tmp=tmp } res_vector[!is.na(res_vector)] } richard <- function(){ cs <- cumsum( rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths ) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } nicola<-function(x) { indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x)) substring(x,indices[-length(indices)]+1,indices[-1]) } richard2 <- function() { cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]]) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } system.time(akrun()) # user system elapsed # 0.003 0.000 0.003 system.time(thelate()) # user system elapsed # 0.272 0.001 0.274 system.time(rawr()) # user system elapsed # 0.397 0.001 0.398 system.time(ananda()) # user system elapsed # 3.744 0.204 3.949 system.time(Colonel()) # user system elapsed # 0.154 0.001 0.154 system.time(Cryo()) # user system elapsed # 0.220 0.005 0.226 system.time(richard()) # user system elapsed # 0.007 0.000 0.006 system.time(nicola(x3)) # user system elapsed # 0.190 0.001 0.191 

在一个稍大的string,

 set.seed(24) x3 <- stri_rand_strings(1, 1e6) system.time(akrun()) #user system elapsed #0.166 0.000 0.155 system.time(richard()) # user system elapsed # 0.606 0.000 0.569 system.time(richard2()) # user system elapsed # 0.518 0.000 0.487 system.time(Colonel()) # user system elapsed # 9.631 0.000 9.358 library(microbenchmark) microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative') #Unit: relative # expr min lq mean median uq max neval cld # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a 

注意:尝试运行其他方法,但需要很长时间

数据

 str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB" 

主题的变化:

 x <- "111110000011110000111000" regmatches(x,gregexpr("1+|0+",x))[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000" 

您可能可以使用substrread.fwf以及rle (尽pipe它不像任何基于正则expression式的解决scheme那样高效):

 x <- "111110000011110000111000" unlist(read.fwf(textConnection(x), rle(strsplit(x, "")[[1]])$lengths, colClasses = "character")) # V1 V2 V3 V4 V5 V6 # "11111" "00000" "1111" "0000" "111" "000" 

这种方法的一个优点是,它甚至可以说:

 x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7), rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "") x # [1] "aaaaabbcccccccbbbad" unlist(read.fwf(textConnection(x), rle(strsplit(x, "")[[1]])$lengths, colClasses = "character")) # V1 V2 V3 V4 V5 V6 # "aaaaa" "bb" "ccccccc" "bbb" "a" "d" 

另一种方法是在交替数字之间添加空格。 这将适用于任何两个,不只是1和0。 然后在strsplit使用strsplit

 x <- "111110000011110000111000" (y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE)) # [1] "11111 00000 1111 0000 111 000 " strsplit(y, ' ')[[1]] # [1] "11111" "00000" "1111" "0000" "111" "000" 

或者@akrun更简洁地指出:

 strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]] # [1] "11111" "00000" "1111" "0000" "111" "000" 

也改变\\d \\w也工作

 x <- "aaaaabbcccccccbbbad" strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]] # [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" x <- "111110000011110000111000" strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]] # [1] "11111" "00000" "1111" "0000" "111" "000" 

你也可以使用\K (而不是明确地使用捕获组, \\1\\2 ),我没有看到这么多,也不知道如何解释它。}

AFAIK \\K重新设置报告匹配的起始点,任何以前消耗的字符不再包括在内,基本上丢掉了与该点匹配的所有内容。

 x <- "1111100000222000333300011110000111000" (z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE)) # [1] "11111 00000 222 000 3333 000 1111 0000 111 000 " 

原始方法:这是一个包含rle rle()stringi方法。

 x <- "111110000011110000111000" library(stringi) cs <- cumsum( rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths ) stri_sub(x, c(1L, head(cs + 1L, -1L)), cs) # [1] "11111" "00000" "1111" "0000" "111" "000" 

或者,您可以在stri_sub()使用length参数

 rl <- rle(stri_split_boundaries(x, type = "character")[[1L]]) with(rl, { stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths) }) # [1] "11111" "00000" "1111" "0000" "111" "000" 

效率更新:意识到base::strsplit()stringi::stri_split_boundaries()更快,这里是我以前的答案更有效的版本,只使用基函数。

 set.seed(24) x3 <- stri_rand_strings(1L, 1e6L) system.time({ cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]]) substring(x3, c(1L, head(cs + 1L, -1L)), cs) }) # user system elapsed # 0.686 0.012 0.697 

另一种情况,使用mapply

 x="111110000011110000111000" with(rle(strsplit(x,'')[[1]]), mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values)) #[1] "11111" "00000" "1111" "0000" "111" "000" 

这实际上并不是OP所要查找的(简明的R代码),但是我认为我会试试Rcpp ,结果相对简单,比基于R的最快答案快5倍。

 library(Rcpp) cppFunction( 'std::vector<std::string> split_str_cpp(std::string x) { std::vector<std::string> parts; int start = 0; for(int i = 1; i <= x.length(); i++) { if(x[i] != x[i-1]) { parts.push_back(x.substr(start, i-start)); start = i; } } return parts; }') 

并在这些testing

 str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB" 

给出以下输出

 > split_str_cpp(str1) [1] "11111" "00000" "1111" "0000" "111" "000" > split_str_cpp(x1) [1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000" > split_str_cpp(x2) [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" [15] "D" "aa" "BB" 

基准testing显示它比R解决scheme快5-10倍。

 akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]] richard1 <- function(x3){ cs <- cumsum( rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths ) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } richard2 <- function(x3) { cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]]) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } library(microbenchmark) library(stringi) set.seed(24) x3 <- stri_rand_strings(1, 1e6) microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L) 

比较:

 Unit: relative expr min lq mean median uq max neval split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134 20 richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446 20 richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680 20 

简单for循环解决scheme

 x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB" res_vector=substr(x,1,1) for (i in 2:nchar(x)) { tmp=substr(x,i,i) if (tmp==substr(x,i-1,i-1)) { res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp) } else { res_vector[length(res_vector)+1]=tmp } } res_vector #[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" "D" "aa" "BB" 

或者预先分配的结果向量也许更快一点

 x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB" res_vector=rep(NA_character_,nchar(x)) res_vector[1]=substr(x,1,1) counter=1 old_tmp='' for (i in 2:nchar(x)) { tmp=substr(x,i,i) if (tmp==old_tmp) { res_vector[counter]=paste0(res_vector[counter],tmp) } else { res_vector[counter+1]=tmp counter=counter+1 } old_tmp=tmp } res_vector[!is.na(res_vector)] 

这个怎么样:

 s <- "111110000011110000111000" spl <- strsplit(s,"10|01")[[1]] l <- length(spl) sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2))) # [1] "11111" "00000" "1111" "0000" "111" "000"