# 数据集A A = data.frame(id=c(1,3,5), A_val=c('a','x','c')) id A_val 1 1 a 2 3 x 3 5 c # 数据集B B = data.frame(id=c(3,5,6), B_val=c('x','y','z')) id B_val 1 3 x 2 5 y 3 6 z # 函数merge( )通过“id”取交集,所以只剩下 id为3 5的 A_B = merge(x=A, y=B, by.x='id', by.y='id') A_B id A_val B_val 1 3 x x 2 5 c y
# 通过“id”将B外接导A上 left_A_B = merge(x=A, y=B, by.x='id', by.y='id', all.x=T) left_A_B # A left join B id A_val B_val 1 1 a <NA> 2 3 x x 3 5 c y # 通过“id”将A外接导B上 right_A_B = merge(x=A, y=B, by.x='id', by.y='id', all.y=T) right_A_B # A right join B id A_val B_val 1 3 x x 2 5 c y 3 6 <NA> z # 通过“id”将AB都保留 full_A_B = merge(x=A, y=B, by.x='id', by.y='id', all=T) full_A_B # A full outer join B id A_val B_val 1 1 a <NA> 2 3 x x 3 5 c y 4 6 <NA> z
3. Concatenation
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# 将A B数据集columns的名字改成统一的 colnames(A) = colnames(B) = c('id','val'); # 然后用rbind()将A B两组数据集通过row合并,B接在A的下面(需要相同的column) rbind(A,B) # Concatenate vertically id val 1 1 a 2 3 x 3 5 c 4 3 x 5 5 y 6 6 z
# 将A B数据集通过column合并,B接在A的后面 cbind(A,B) # Concatenate horizontally id val id val 1 1 a 3 x 2 3 x 5 y 3 5 c 6 z
4. Set Operation交集
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# A intersect B subset(A, (A$id %in% B$id & A$val %in% B$val)) id val 2 3 x # A except B subset(A, ! (A$id %in% B$id & A$val %in% B$val)) id val 1 1 a 3 5 c # B except A subset(B, ! (B$id %in% A$id & B$val %in% A$val)) id val 2 5 y 3 6 z
5. 用dplyr包来查找交集
1 2 3 4 5 6 7 8
# More efficient row/column binding dplyr::bind_rows(A, B); dplyr::bind_cols(A, B) # A intersect/union B dplyr::intersect(A, B); dplyr::union(A, B); # A except B; B except A dplyr::setdiff(A, B); dplyr::setdiff(B, A) # Observation-level set comparison dplyr::setequal(A, B)
library(sqldf) # Inner join sqldf('select A.id, A.val as A_val, B.val as B_val from A innerjoin B on A.id = B.id') # Left outer join sqldf('select A.id, A.val as A_val, B.val as B_val from A leftjoin B on A.id = B.id') # Union sqldf('select * from A unionselect * from B') sqldf('select * from A exceptselect * from B') # id val # 1 1 a # 2 5 c sqldf('select * from A intersectselect * from B') # id val # 1 3 x sqldf('select * from (select * from A unionallselect * from B) whereid > 5') # id val # 1 6 z mtcars = data.frame(mtcars); sqldf('select row_names, mpg, cyl, wt from mtcars where row_names like"%Toyota%"', row.names=T) # mpg cyl wt # Toyota Corolla 33.9 4 1.835 # Toyota Corona 21.5 4 2.465