pandas3

Pandas Introduction 3

Concat and Append

1
2
import pandas as pd 
import numpy as np
1
2
3
4
5
6
7
8
9
10
11
12
def make_df(cols, ind): 
"""一个简单的DataFrame"""
data = {c: [str(c) + str(i) for i in ind]
for c in cols}
return pd.DataFrame(data, ind)
# DataFrame示例
make_df('ABC', range(3))
Out:
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2

review: numpy.concatenate

1
2
3
4
5
6
7
8
x = [[1, 2], 
[3, 4]]
np.concatenate([x, x], axis=1)
'''
Out:
array([[1, 2, 1, 2],
[3, 4, 3, 4]])
'''

use .concat to combine

1
2
3
4
5
6
7
8
9
10
11
12
13
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3]) 
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
'''
Out:
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
'''

and multilevel one:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
df1 = make_df('AB', [1, 2]) 
df2 = make_df('AB', [3, 4])
print(df1); print(df2); print(pd.concat([df1, df2]))
df1 df2 pd.concat([df1, df2])
'''
out:
A B A B A B
1 A1 B1 3 A3 B3 1 A1 B1
2 A2 B2 4 A4 B4 2 A2 B2
3 A3 B3
4 A4 B4
'''

#change the direction
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3); print(df4); print(pd.concat([df3, df4], axis='col'))
'''
out:
df3 df4 pd.concat([df3, df4], axis='col')
A B C D A B C D
0 A0 B0 0 C0 D0 0 A0 B0 C0 D0
1 A1 B1 1 C1 D1 1 A1 B1 C1 D1
'''

when the indexes are the same:

1
2
3
4
5
6
7
8
9
10
11
12
13
x = make_df('AB', [0, 1]) 
y = make_df('AB', [2, 3])
y.index = x.index # 复制索引
print(x); print(y); print(pd.concat([x, y]))
'''
out:
x y pd.concat([x, y])
A B A B A B
0 A0 B0 0 A2 B2 0 A0 B0
1 A1 B1 1 A3 B3 1 A1 B1
0 A2 B2
1 A3 B3
'''

ignore:

1
2
3
4
5
6
7
8
9
print(x); print(y); print(pd.concat([x, y], ignore_index=True)) 
'''
x y pd.concat([x, y], ignore_index=True)
A B A B A B
0 A0 B0 0 A2 B2 0 A0 B0
1 A1 B1 1 A3 B3 1 A1 B1
2 A2 B2
3 A3 B3
'''

add more indexes:

1
2
3
4
5
6
7
8
9
print(x); print(y); print(pd.concat([x, y], keys=['x', 'y'])) 
x y pd.concat([x, y], keys=['x', 'y'])
'''
A B A B A B
0 A0 B0 0 A2 B2 x 0 A0 B0
1 A1 B1 1 A3 B3 1 A1 B1
y 0 A2 B2
1 A3 B3
'''

join:

1
2
3
4
5
6
7
8
9
10
11
df5 = make_df('ABC', [1, 2]) 
df6 = make_df('BCD', [3, 4])
print(df5); print(df6); print(pd.concat([df5, df6])
'''
df5 df6 pd.concat([df5, df6])
A B C B C D A B C D
1 A1 B1 C1 3 B3 C3 D3 1 A1 B1 C1 NaN
2 A2 B2 C2 4 B4 C4 D4 2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
'''

to deal with NaN:

inner and outer:

1
2
3
4
5
6
7
8
9
10
print(df5); print(df6); 
print(pd.concat([df5, df6], join='inner'))
df5 df6 pd.concat([df5, df6], join='inner')
'''
A B C B C D B C
1 A1 B1 C1 3 B3 C3 D3 1 B1 C1
2 A2 B2 C2 4 B4 C4 D4 2 B2 C2
3 B3 C3
4 B4 C4
'''

axe:

1
2
3
4
5
6
7
8
9
10
print(df5); print(df6); 
print(pd.concat([df5, df6], join_axes=[df5.columns]))
'''
df5 df6 pd.concat([df5, df6], join_axes=[df5.columns])
A B C B C D A B C
1 A1 B1 C1 3 B3 C3 D3 1 A1 B1 C1
2 A2 B2 C2 4 B4 C4 D4 2 A2 B2 C2
3 NaN B3 C3
4 NaN B4 C4
'''

append:df1.append(df2) = pd.concat([df1, df2])

merge

pd.merge:

one to one:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'], 
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
print(df1); print(df2)
'''
df1 df2
employee group employee hire_date
0 Bob Accounting 0 Lisa 2004
1 Jake Engineering 1 Bob 2008
2 Lisa Engineering 2 Jake 2012
3 Sue HR 3 Sue 2014
'''

df3 = pd.merge(df1, df2)
df3
'''
Out:
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
'''

many to one:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'], 
'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3); print(df4); print(pd.merge(df3, df4))
'''
df3 df4
employee group hire_date group supervisor
0 Bob Accounting 2008 0 Accounting Carly
1 Jake Engineering 2012 1 Engineering Guido
2 Lisa Engineering 2004 2 HR Steve
3 Sue HR 2014
'''

pd.merge(df3, df4)
'''
employee group hire_date supervisor
0 Bob Accounting 2008 Carly
1 Jake Engineering 2012 Guido
2 Lisa Engineering 2004 Guido
3 Sue HR 2014 Steve
'''

many to many:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting', 
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
print(df1); print(df5); print(pd.merge(df1, df5))
'''
df1 df5
employee group group skills
0 Bob Accounting 0 Accounting math
1 Jake Engineering 1 Accounting spreadsheets
2 Lisa Engineering 2 Engineering coding
3 Sue HR 3 Engineering linux
4 HR spreadsheets
5 HR organization
'''

pd.merge(df1, df5)
'''
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization
'''

use of on

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
print(df1); print(df2); print(pd.merge(df1, df2, on='employee')) 
df1 df2
'''
employee group employee hire_date
0 Bob Accounting 0 Lisa 2004
1 Jake Engineering 1 Bob 2008
2 Lisa Engineering 2 Jake 2012
3 Sue HR 3 Sue 2014

pd.merge(df1, df2, on='employee')
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
'''

when two dataframes use different column names but the sam data,we can use left_on and right_on

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 'salary': [70000, 80000, 120000, 90000]}) 
print(df1); print(df3);
print(pd.merge(df1, df3, left_on="employee", right_on="name"))
'''
df1 df3
employee group name salary
0 Bob Accounting 0 Bob 70000
1 Jake Engineering 1 Jake 80000
2 Lisa Engineering 2 Lisa 120000
3 Sue HR 3 Sue 90000
pd.merge(df1, df3, left_on="employee", right_on="name")
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
'''

left_index and right_index

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
df1a = df1.set_index('employee') 
df2a = df2.set_index('employee')
print(df1a); print(df2a)
'''
df1a df2a
group hire_date
employee employee
Bob Accounting Lisa 2004
Jake Engineering Bob 2008
Lisa Engineering Jake 2012
Sue HR Sue 2014
'''

print(df1a); print(df2a);
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
'''
df1a df2a
group hire_date
employee employee
Bob Accounting Lisa 2004
Jake Engineering Bob 2008
Lisa Engineering Jake 2012
Sue HR Sue 2014

pd.merge(df1a, df2a, left_index=True, right_index=True)
group hire_date
employee
Lisa Engineering 2004
Bob Accounting 2008
Jake Engineering 2012
Sue HR 2014
'''

#join
print(df1a); print(df2a); print(df1a.join(df2a))
'''
df1a df2a
group hire_date
employee employee
Bob Accounting Lisa 2004
Jake Engineering Bob 2008
Lisa Engineering Jake 2012
Sue HR Sue 2014

pd.merge(df1a, df2a, left_index=True, right_index=True)
group hire_date
employee
Lisa Engineering 2004
Bob Accounting 2008
Jake Engineering 2012
Sue HR 2014
'''

print(df1a); print(df3);
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
'''
df1 df3
employee group name salary
0 Bob Accounting 0 Bob 70000
1 Jake Engineering 1 Jake 80000
2 Lisa Engineering 2 Lisa 120000
3 Sue HR 3 Sue 90000
pd.merge(df1, df3, left_on="employee", right_on="name")
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
'''

merge of different dataframes:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'], 
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
print(df6); print(df7); print(pd.merge(df6, df7))
'''
df6 df7 pd.merge(df6, df7)
name food name drink name food drink
0 Peter fish 0 Mary wine 0 Mary bread wine
1 Paul beans 1 Joseph beer
2 Mary bread
'''

pd.merge(df6, df7, how='inner')
# name food drink
# 0 Mary bread wine

#NaN
print(df6); print(df7); print(pd.merge(df6, df7, how='outer'))
df6 df7 pd.merge(df6, df7, how='outer')
'''
name food name drink name food drink
0 Peter fish 0 Mary wine 0 Peter fish NaN
1 Paul beans 1 Joseph beer 1 Paul beans NaN
2 Mary bread 2 Mary bread wine
3 Joseph NaN beer
'''

#left join,right join
print(df6); print(df7); print(pd.merge(df6, df7, how='left'))
df6 df7 pd.merge(df6, df7, how='left')
'''
name food name drink name food drink
0 Peter fish 0 Mary wine 0 Peter fish NaN
1 Paul beans 1 Joseph beer 1 Paul beans NaN
2 Mary bread 2 Mary bread wine
'''

suffixes: the column names are repeated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
print(df8); print(df9); print(pd.merge(df8, df9, on="name"))
'''
df8 df9 pd.merge(df8, df9, on="name")
name rank name rank name rank_x rank_y
0 Bob 1 0 Bob 3 0 Bob 1 3
1 Jake 2 1 Jake 1 1 Jake 2 1
2 Lisa 3 2 Lisa 4 2 Lisa 3 4
3 Sue 4 3 Sue 2 3 Sue 4 2
'''

print(df8); print(df9);
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))
'''
df8 df9 pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])
name rank name rank name rank_L rank_R
0 Bob 1 0 Bob 3 0 Bob 1 3
1 Jake 2 1 Jake 1 1 Jake 2 1
2 Lisa 3 2 Lisa 4 2 Lisa 3 4
3 Sue 4 3 Sue 2 3 Sue 4 2
'''