我正在尝试导入以下文件,其中有两类重复出现的数据部分需要提取。第一部分以一个未使用的标题行(第 5 行)开始,真正的标题行以“ES”开头(第 6 行)。下一部分数据以一个未使用的标题行(第 13 行)开始,其后是以“LU”开头并带有更多变量名的真正标题行(第 14 行)。这样的文件有很多,每个文件中 ES 和 LU 部分的数量和长度都不相同。我需要把 ES 和 LU 数据分别提取到各自的数据框中。不幸的是,这些文件是传感器阵列“按原样”输出的,我无法更改其格式;我不想在 Excel 中手动完成所有这些操作,但也许不得不这样做。在实际文件中,每个 ES 和 LU 部分可能有数百行这样的数据。
我试图调整以下代码:先索引 ES 部分,将其提取并清理,然后对 LU 部分做同样的处理,但我甚至没能让它运行起来。部分原因是“ES”同时出现在两个标题行中。我确实见过用 perl 脚本实现的代码,但我从未使用过这种语言。
# Attempted approach: read the raw file, flag the marker lines, parse the
# remaining lines as a table, then label each row with its preceding marker.
# NOTE(review): the sample data in this post contains no "EU" text, so `idx`
# would be all FALSE for it -- presumably "ES" was intended; confirm.
lns <- readLines("lake1.txt")
idx <- grepl("EU", lns)
df <- read.table(text = lns[!idx])
# Number of non-marker lines that follow each marker line.
wd <- diff(c(which(idx), length(idx) + 1)) - 1
df$label <- rep(lns[idx], wd)
我不确定添加 CSV 文件示例的最佳方法,但这里是...
Garbage Text 1,,,,,,,,
Garbage Text 2,,,,,,,,
Garbage Text 3,,,,,,,,
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),ES_DARK ('uW/cm^2/nm'),ES_DARK ('uW/cm^2/nm'),ES_DARK ('uW/cm^2/nm'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,0.35441789,-0.00060208,0.10290995,87,2017015,10:42:39,1
0.032,0,-0.36023974,-0.22242269,-0.09639,109,2017015,10:42:40,10
0.032,0,0.07552711,0.01524224,-0.16756855,91,2017015,10:42:48,41
,,,,,,,,11304
,,,,,,,,11312
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,0.02288441,0.02891912,0.03595322,53,2017015,10:42:38,2
0.032,0,-0.00014323,0.00024047,0.00001585,212,2017015,10:42:38,6
0.032,0,0.00114258,0.00091736,-0.0000495,16,2017015,10:42:39,9
0.032,0,0.00020744,0.0004186,0.00027721,118,2017015,10:42:40,16
,,,,,,,,11310
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),ES ('uW/cm^2/nm'),ES ('uW/cm^2/nm'),ES ('uW/cm^2/nm'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,56.7600789,59.43147464,62.83968564,186,2017015,10:42:38,3
0.032,0,56.27202003,59.52654061,62.86815706,29,2017015,10:42:38,4
,,,,,,,,11309
,,,,,,,,11311
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,-0.00011611,-0.00039544,-0.00014584,3,2017015,10:42:42,20
0.032,0,-0.00032394,-0.00020563,-0.00020383,229,2017015,10:42:46,39
这就是这两个数据帧最终的样子:
数据框 1
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,0.35441789,-0.00060208,0.10290995,87,2017015,10:42:39,1
0.032,0,-0.36023974,-0.22242269,-0.09639,109,2017015,10:42:40,10
0.032,0,0.07552711,0.01524224,-0.16756855,91,2017015,10:42:48,41
0.032,0,56.7600789,59.43147464,62.83968564,186,2017015,10:42:38,3
0.032,0,56.27202003,59.52654061,62.86815706,29,2017015,10:42:38,4
数据框 2
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,0.02288441,0.02891912,0.03595322,53,2017015,10:42:38,2
0.032,0,-0.00014323,0.00024047,0.00001585,212,2017015,10:42:38,6
0.032,0,0.00114258,0.00091736,-0.0000495,16,2017015,10:42:39,9
0.032,0,0.00020744,0.0004186,0.00027721,118,2017015,10:42:40,16
0.032,0,-0.00011611,-0.00039544,-0.00014584,3,2017015,10:42:42,20
0.032,0,-0.00032394,-0.00020563,-0.00020383,229,2017015,10:42:46,39
以下是使用 tidyverse 工具解决此问题的一种方法。
- readr:用于读取/写入 csv 文件
- dplyr:用于数据框操作
- stringr:用于字符串操作
library(readr)
library(dplyr)
library(stringr)
# Read every column as text and skip the first three lines of the file.
# No header row is used, so columns get the default names X1, X2, ...
df_1 <- read_csv(
  "test1.csv",
  skip = 3,
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
首先删除所有值都缺失(或除最后一列之外全部缺失)的行,以及多余的标题行。然后新建一列:标题行取其 ES 或 LU 值,其余行为 NA,再使用 tidyr::fill 向下填充这些值。最后把两个 NONE 列名分别改为 DATE 和 TIME,因为稍后我们不希望两列具有相同的名称。
# Drop rows with nothing in the first column and the unused "INTTIME" header
# rows, tag every remaining row with its section, and give the two "NONE"
# header cells distinct names.
df_2 <- df_1 %>%
  filter(!(is.na(X1) | str_detect(X1, "INTTIME"))) %>%
  # Section header rows carry "ES"/"LU" in X1; every other row starts as NA ...
  mutate(grp = replace(X1, !(X1 %in% c("ES", "LU")), NA_character_)) %>%
  # ... and then inherits the label of the closest header row above it.
  tidyr::fill(grp, .direction = "down") %>%
  mutate(X7 = str_replace(X7, "NONE", "DATE")) %>%
  mutate(X8 = str_replace(X8, "NONE", "TIME"))

df_2
#> # A tibble: 15 x 10
#> X1 X2 X3 X4 X5 X6 X7 X8 X9 grp
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 ES DELAY 344.83 348.23 351.62 SUM DATE TIME COUNTS ES
#> 2 0.032 0 0.35441789 -0.00060208 0.10290995 87 2017015 10:42:39 1 ES
#> 3 0.032 0 -0.36023974 -0.22242269 -0.09639 109 2017015 10:42:40 10 ES
#> 4 0.032 0 0.07552711 0.01524224 -0.16756855 91 2017015 10:42:48 41 ES
#> 5 LU DELAY 344.37 347.75 351.13 SUM DATE TIME COUNTS LU
#> 6 0.032 0 0.02288441 0.02891912 0.03595322 53 2017015 10:42:38 2 LU
#> 7 0.032 0 -0.00014323 0.00024047 0.00001585 212 2017015 10:42:38 6 LU
#> 8 0.032 0 0.00114258 0.00091736 -0.0000495 16 2017015 10:42:39 9 LU
#> 9 0.032 0 0.00020744 0.0004186 0.00027721 118 2017015 10:42:40 16 LU
#> 10 ES DELAY 344.83 348.23 351.62 SUM DATE TIME COUNTS ES
#> 11 0.032 0 56.7600789 59.43147464 62.83968564 186 2017015 10:42:38 3 ES
#> 12 0.032 0 56.27202003 59.52654061 62.86815706 29 2017015 10:42:38 4 ES
#> 13 LU DELAY 344.37 347.75 351.13 SUM DATE TIME COUNTS LU
#> 14 0.032 0 -0.00011611 -0.00039544 -0.00014584 3 2017015 10:42:42 20 LU
#> 15 0.032 0 -0.00032394 -0.00020563 -0.00020383 229 2017015 10:42:46 39 LU
现在,对 ES 和 LU 分别进行处理:筛选出对应的记录,删除新建的 grp 列,用第一行作为列名,再删除这些列标题行,最后写入一个新的、清理过的 csv 文件。
# Build the ES table: keep the ES section rows, drop the helper column,
# promote the first row (the section header) to column names, and remove
# the repeated header rows.
df_es <- df_2 %>%
  filter(grp == "ES") %>%
  select(-grp) %>%
  purrr::set_names(., .[1, ]) %>%
  filter(ES != "ES")

# Persist the cleaned table.
write_csv(df_es, "ES.csv")

df_es
#> # A tibble: 5 x 9
#> ES DELAY `344.83` `348.23` `351.62` SUM DATE TIME COUNTS
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 0.032 0 0.35441789 -0.00060208 0.10290995 87 2017015 10:42:39 1
#> 2 0.032 0 -0.36023974 -0.22242269 -0.09639 109 2017015 10:42:40 10
#> 3 0.032 0 0.07552711 0.01524224 -0.16756855 91 2017015 10:42:48 41
#> 4 0.032 0 56.7600789 59.43147464 62.83968564 186 2017015 10:42:38 3
#> 5 0.032 0 56.27202003 59.52654061 62.86815706 29 2017015 10:42:38 4
# Build the LU table: keep the LU section rows, drop the helper column,
# promote the first row (the section header) to column names, remove the
# repeated header rows, and write the cleaned table to LU.csv.
df_lu <- df_2 %>%
  filter(grp == "LU") %>%
  select(-grp) %>%
  # purrr is not attached by the library() calls in this script, so
  # set_names() must be namespace-qualified (as in the ES block).
  purrr::set_names(., .[1, ]) %>%
  filter(LU != "LU") %>%
  write_csv("LU.csv")

df_lu
#> # A tibble: 6 x 9
#> LU DELAY `344.37` `347.75` `351.13` SUM DATE TIME COUNTS
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 0.032 0 0.02288441 0.02891912 0.03595322 53 2017015 10:42:38 2
#> 2 0.032 0 -0.00014323 0.00024047 0.00001585 212 2017015 10:42:38 6
#> 3 0.032 0 0.00114258 0.00091736 -0.0000495 16 2017015 10:42:39 9
#> 4 0.032 0 0.00020744 0.0004186 0.00027721 118 2017015 10:42:40 16
#> 5 0.032 0 -0.00011611 -0.00039544 -0.00014584 3 2017015 10:42:42 20
#> 6 0.032 0 -0.00032394 -0.00020563 -0.00020383 229 2017015 10:42:46 39
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句