我有一个如下所示的资料框:
> data
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
17 <NA> AmonGCarter0.724 NA NA AmonGCarter0.67 NA NA 155
18 <NA> AmonGCarter0.74 NA NA AmonGCarter0.629 NA NA 568
19 <NA> AmonGCarter0.8 NA NA AmonGCarter0.885 NA NA 456
20 <NA> AmonGCarter0.885 NA NA AmonGCarter0.842 NA NA 244
21 <NA> AmonGCarter0.914 NA NA AmonGCarter0.958 NA NA 36
22 <NA> AmonGCarter0.958 NA NA AmonGCarter1.035 NA NA 417
23 <NA> AmonGCarter0.976 NA NA AmonGCarter1.075 NA NA 58
24 <NA> AmonGCarter1.018 NA NA AmonGCarter0.92 NA NA 603
25 <NA> AmonGCarter1.035 NA NA AmonGCarter0.918 NA NA 194
> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry",
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA,
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82",
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009",
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885",
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018",
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878,
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA,
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966,
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708",
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958",
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885",
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075",
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333,
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681,
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708,
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L,
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))
我正在尝试将此资料框与如下所示的资料框合并:
surface_area Lake_name percent2 Lake_name_percent
<dbl> <chr> <dbl> <chr>
1 1159. AmonGCarter 0.724 AmonGCarter0.724
2 1176. AmonGCarter 0.74 AmonGCarter0.74
3 1240. AmonGCarter 0.8 AmonGCarter0.8
4 1329. AmonGCarter 0.885 AmonGCarter0.885
5 1360. AmonGCarter 0.914 AmonGCarter0.914
6 1407. AmonGCarter 0.958 AmonGCarter0.958
7 1426. AmonGCarter 0.976 AmonGCarter0.976
8 1468 AmonGCarter 1.02 AmonGCarter1.018
9 1484. AmonGCarter 1.03 AmonGCarter1.035
> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714,
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724",
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914",
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当我像这样合并资料集时:
merge(data,AmonGCarter.sa, all.y = T)
我的资料在没有 AlanHenry 资料的情况下看起来像这样:
> merge(data,AmonGCarter.sa, all.y = T)
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AmonGCarter AmonGCarter0.724 1159.100 0.724 <NA> NA NA NA
2 AmonGCarter AmonGCarter0.74 1175.800 0.740 <NA> NA NA NA
3 AmonGCarter AmonGCarter0.8 1239.657 0.800 <NA> NA NA NA
4 AmonGCarter AmonGCarter0.885 1329.200 0.885 <NA> NA NA NA
5 AmonGCarter AmonGCarter0.914 1359.600 0.914 <NA> NA NA NA
6 AmonGCarter AmonGCarter0.958 1406.704 0.958 <NA> NA NA NA
7 AmonGCarter AmonGCarter0.976 1426.200 0.976 <NA> NA NA NA
8 AmonGCarter AmonGCarter1.018 1468.000 1.018 <NA> NA NA NA
9 AmonGCarter AmonGCarter1.035 1484.200 1.035 <NA> NA NA NA
> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74",
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958",
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"),
surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2,
1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), row.names = c(NA, -9L), class = "data.frame")
我如何将资料添加AmonGCarter
到合并的资料框中,以保留 AlanHenry 的先前资料?
uj5u.com热心网友回复:
您可以full_join
在tidyverse
. 这会创建额外的列(但我们用后缀控制名称),但随后我们可以使用coalesce
组合类似的列。然后,我们可以洗掉额外的列。
library(tidyverse)
data %>%
full_join(AmonGCarter.sa, by = 'Lake_name_percent', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~ coalesce(., get(
sub('_x', '_y', cur_column())
)),
.names = '{sub("_x", "", {.col})}')) %>%
select(!ends_with('_x') & !ends_with('_y')) %>%
select(names(data))
另一种简单的方法是使用natural_join
fromrquery
包。它将取代NA
在值data
与值AmonGCarter.sa
。然后,要按原始顺序获取列data
,您可以使用select
from dplyr
。
library(rquery)
library(rqdatatable)
library(dplyr)
rquery::natural_join(data, AmonGCarter.sa, by = "Lake_name_percent", jointype = "FULL") %>%
dplyr::select(names(data))
输出
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
7 AmonGCarter AmonGCarter0.724 1159.100 0.724 AmonGCarter0.67 NA NA 155
8 AmonGCarter AmonGCarter0.74 1175.800 0.740 AmonGCarter0.629 NA NA 568
9 AmonGCarter AmonGCarter0.8 1239.657 0.800 AmonGCarter0.885 NA NA 456
10 AmonGCarter AmonGCarter0.885 1329.200 0.885 AmonGCarter0.842 NA NA 244
11 AmonGCarter AmonGCarter0.914 1359.600 0.914 AmonGCarter0.958 NA NA 36
12 AmonGCarter AmonGCarter0.958 1406.704 0.958 AmonGCarter1.035 NA NA 417
13 AmonGCarter AmonGCarter0.976 1426.200 0.976 AmonGCarter1.075 NA NA 58
14 AmonGCarter AmonGCarter1.018 1468.000 1.018 AmonGCarter0.92 NA NA 603
15 AmonGCarter AmonGCarter1.035 1484.200 1.035 AmonGCarter0.918 NA NA 194
0 评论