与SQL比较
常用方法
### SELECT
tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
### WHERE
tips[tips['time'] == 'Dinner'].head(5)
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
frame[frame['col2'].isna()]
frame[frame['col1'].notna()]
### GROUP BY
tips.groupby('sex')['total_bill'].count()
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})
### JOIN
#### INNER JOIN
pd.merge(df1, df2, on='key')
#### LEFT OUTER JOIN
pd.merge(df1, df2, on='key', how='left')
#### RIGHT JOIN
pd.merge(df1, df2, on='key', how='right')
#### FULL JOIN
pd.merge(df1, df2, on='key', how='outer')
### UNION
pd.concat([df1, df2])
pd.concat([df1, df2]).drop_duplicates()
### 更新(UPDATE)
tips.loc[tips['tip'] < 2, 'tip'] *= 2
### 删除(DELETE)
tips = tips.loc[tips['tip'] <= 9]
Pandas等同于某些SQL分析和聚合函数
每组前N行
-- Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
SELECT
t.*,
ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
In [35]: (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False)
....: .groupby(['day'])
....: .cumcount() + 1)
....: .query('rn < 3')
....: .sort_values(['day', 'rn']))
....:
Out[35]:
total_bill tip sex smoker day time size rn
95 40.17 4.73 Male Yes Fri Dinner 4 1
90 28.97 3.00 Male Yes Fri Dinner 2 2
170 50.81 10.00 Male Yes Sat Dinner 3 1
212 48.33 9.00 Male No Sat Dinner 4 2
156 48.17 5.00 Male No Sun Dinner 6 1
182 45.35 3.50 Male Yes Sun Dinner 3 2
197 43.11 5.00 Female Yes Thur Lunch 4 1
142 41.19 5.00 Male No Thur Lunch 5 2
同样使用 rank (method ='first') 函数
In [36]: (tips.assign(rnk=tips.groupby(['day'])['total_bill']
....: .rank(method='first', ascending=False))
....: .query('rnk < 3')
....: .sort_values(['day', 'rnk']))
....:
Out[36]:
total_bill tip sex smoker day time size rnk
95 40.17 4.73 Male Yes Fri Dinner 4 1.0
90 28.97 3.00 Male Yes Fri Dinner 2 2.0
170 50.81 10.00 Male Yes Sat Dinner 3 1.0
212 48.33 9.00 Male No Sat Dinner 4 2.0
156 48.17 5.00 Male No Sun Dinner 6 1.0
182 45.35 3.50 Male Yes Sun Dinner 3 2.0
197 43.11 5.00 Female Yes Thur Lunch 4 1.0
142 41.19 5.00 Male No Thur Lunch 5 2.0
-- Oracle's RANK() analytic function
SELECT * FROM (
SELECT
t.*,
RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
FROM tips t
WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
让我们找到每个性别组(等级<3)的提示(提示<2)。请注意,使用rank(method='min')
函数时
rnk_min对于相同的提示保持不变
(如Oracle的RANK()函数)
In [37]: (tips[tips['tip'] < 2]
....: .assign(rnk_min=tips.groupby(['sex'])['tip']
....: .rank(method='min'))
....: .query('rnk_min < 3')
....: .sort_values(['sex', 'rnk_min']))
....:
Out[37]:
total_bill tip sex smoker day time size rnk_min
67 3.07 1.00 Female Yes Sat Dinner 1 1.0
92 5.75 1.00 Female Yes Fri Dinner 2 1.0
111 7.25 1.00 Female No Sat Dinner 1 1.0
236 12.60 1.00 Male Yes Sat Dinner 2 1.0
237 32.83 1.17 Male Yes Sat Dinner 2 2.0