合并数据集
很多情况下,数据集都不会只有一个文件,但是为了方便后续的处理和分析,在预处理的阶段就会将各个数据集进行合并。
在这里插入图片描述

1. Inner Join内连接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 数据集A
A = data.frame(id=c(1,3,5), A_val=c('a','x','c'))
id A_val
1 1 a
2 3 x
3 5 c
# 数据集B
B = data.frame(id=c(3,5,6), B_val=c('x','y','z'))
id B_val
1 3 x
2 5 y
3 6 z
# 函数merge( )通过“id”取交集,所以只剩下 id为3 5的
A_B = merge(x=A, y=B, by.x='id', by.y='id') A_B
id A_val B_val
1 3 x x
2 5 c y

2. Out Join 外连接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 通过“id”将B外接导A上
left_A_B = merge(x=A, y=B, by.x='id', by.y='id', all.x=T)
left_A_B # A left join B
id A_val B_val
1 1 a <NA>
2 3 x x
3 5 c y
# 通过“id”将A外接导B上
right_A_B = merge(x=A, y=B, by.x='id', by.y='id', all.y=T)
right_A_B # A right join B
id A_val B_val
1 3 x x
2 5 c y
3 6 <NA> z
# 通过“id”将AB都保留
full_A_B = merge(x=A, y=B, by.x='id', by.y='id', all=T)
full_A_B # A full outer join B
id A_val B_val
1 1 a <NA>
2 3 x x
3 5 c y
4 6 <NA> z

3. Concatenation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 将A B数据集columns的名字改成统一的
colnames(A) = colnames(B) = c('id','val');
# 然后用rbind()将A B两组数据集通过row合并,B接在A的下面(需要相同的column)
rbind(A,B) # Concatenate vertically
id val
1 1 a
2 3 x
3 5 c
4 3 x
5 5 y
6 6 z

# 将A B数据集通过column合并,B接在A的后面
cbind(A,B) # Concatenate horizontally
id val id val
1 1 a 3 x
2 3 x 5 y
3 5 c 6 z

4. Set Operation交集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# A intersect B
subset(A, (A$id %in% B$id & A$val %in% B$val))
id val
2 3 x
# A except B
subset(A, ! (A$id %in% B$id & A$val %in% B$val))
id val
1 1 a
3 5 c
# B except A
subset(B, ! (B$id %in% A$id & B$val %in% A$val))
id val
2 5 y
3 6 z

5. 用dplyr包来查找交集

1
2
3
4
5
6
7
8
# More efficient row/column binding
dplyr::bind_rows(A, B); dplyr::bind_cols(A, B)
# A intersect/union B
dplyr::intersect(A, B); dplyr::union(A, B);
# A except B; B except A
dplyr::setdiff(A, B); dplyr::setdiff(B, A)
# Observation-level set comparison
dplyr::setequal(A, B)

6. 用SQL语法来合并数据集(sqldf包)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
library(sqldf)
# Inner join
sqldf('select A.id, A.val as A_val, B.val as B_val from A inner join B on A.id = B.id')
# Left outer join
sqldf('select A.id, A.val as A_val, B.val as B_val from A left join B on A.id = B.id')
# Union
sqldf('select * from A union select * from B')

sqldf('select * from A except select * from B')
# id val
# 1 1 a
# 2 5 c
sqldf('select * from A intersect select * from B')
# id val
# 1 3 x
sqldf('select * from
(select * from A union all select * from B) where id > 5')
# id val
# 1 6 z
mtcars = data.frame(mtcars);
sqldf('select row_names, mpg, cyl, wt from mtcars
where row_names like "%Toyota%" ', row.names=T)
# mpg cyl wt
# Toyota Corolla 33.9 4 1.835
# Toyota Corona 21.5 4 2.465