pandas学习笔记强化

阅读数: 次 2021-10-06

pandas学习笔记强化

1.如何处理数据中的缺失值

1.1 判断是否存在缺失值

import pandas as pd
import numpy as np
movie = pd.read_csv('IMDB-Movie-Data.csv')
# 1.判断是否存在缺失值
np.any(pd.isnull(movie))  # 返回True,说明数据中存在缺失值
True
np.all(pd.notnull(movie))  # 返回False,说明数据中存在缺失值
False

pd.isnull(movie).any()  # 按列判断:结果为True的列存在缺失值
Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)     True
Metascore              True
dtype: bool

pd.notnull(movie).all()  # 按列判断:结果为False的列存在缺失值
Rank                   True
Title                  True
Genre                  True
Description            True
Director               True
Actors                 True
Year                   True
Runtime (Minutes)      True
Rating                 True
Votes                  True
Revenue (Millions)    False
Metascore             False
dtype: bool

1.2 缺失值处理

# 2.缺失值处理
# 方法1：删除含有缺失值的样本
data1 = movie.dropna()
pd.isnull(data1).any()
Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
dtype: bool

# 方法2：替换
# 含有缺失值的字段
# Revenue (Millions)     
# Metascore        
movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean(),inplace=True)  # nan部分替换为平均值
movie['Metascore'].fillna(movie['Metascore'].mean(),inplace=True)
pd.notnull(movie).all()  # 缺失值已经替换完毕
Rank                  True
Title                 True
Genre                 True
Description           True
Director              True
Actors                True
Year                  True
Runtime (Minutes)     True
Rating                True
Votes                 True
Revenue (Millions)    True
Metascore             True
dtype: bool
movie.head()

2.如何实现数据的离散化

# 1.准备数据
data = pd.Series([165,174,160,180,159,163,192,184], index=['No1:165', 'No2:174','No3:160', 'No4:180', 'No5:159', 'No6:163', 'No7:192', 'No8:184'])
data
No1:165    165
No2:174    174
No3:160    160
No4:180    180
No5:159    159
No6:163    163
No7:192    192
No8:184    184
dtype: int64
# 2.进行分组
# 自动分组
sr = pd.qcut(data,3)
sr
No1:165      (163.667, 178.0]
No2:174      (163.667, 178.0]
No3:160    (158.999, 163.667]
No4:180        (178.0, 192.0]
No5:159    (158.999, 163.667]
No6:163    (158.999, 163.667]
No7:192        (178.0, 192.0]
No8:184        (178.0, 192.0]
dtype: category
Categories (3, interval[float64]): [(158.999, 163.667] < (163.667, 178.0] < (178.0, 192.0]]

# 3.转换成one-hot编码
pd.get_dummies(sr,prefix='height')

sr.value_counts()  # 分组情况
(178.0, 192.0]        3
(158.999, 163.667]    3
(163.667, 178.0]      2
dtype: int64

# 自定义分组
bins = [150, 165, 180, 195]
sr = pd.cut(data,bins)
sr
No1:165    (150, 165]
No2:174    (165, 180]
No3:160    (150, 165]
No4:180    (165, 180]
No5:159    (150, 165]
No6:163    (150, 165]
No7:192    (180, 195]
No8:184    (180, 195]
dtype: category
Categories (3, interval[int64]): [(150, 165] < (165, 180] < (180, 195]]
sr.value_counts()
(150, 165]    4
(180, 195]    2
(165, 180]    2
dtype: int64
pd.get_dummies(sr,prefix='身高')

3.案例：股票的涨跌幅离散化

1
2
3

# 1.读取数据
stock = pd.read_csv('stock_day.csv')
stock

p_change = stock['p_change']
# 2.自动分组qcut
sr = pd.qcut(p_change,10)
sr.value_counts()
(5.27, 10.03]                    65
(0.26, 0.94]                     65
(-0.462, 0.26]                   65
(-10.030999999999999, -4.836]    65
(2.938, 5.27]                    64
(1.738, 2.938]                   64
(-1.352, -0.462]                 64
(-2.444, -1.352]                 64
(-4.836, -2.444]                 64
(0.94, 1.738]                    63
Name: p_change, dtype: int64
# 3.离散化 get_dummies
pd.get_dummies(sr,prefix='涨跌幅')

# 自定义分组 cut
bins = [-100, -7, -5, -3, 0, 3, 5, 7, 100]
sr1 = pd.cut(p_change,bins)
sr1.value_counts()
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(7, 100]       35
(5, 7]         35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
stock_change = pd.get_dummies(sr1,prefix='rise') # one-hot
stock_change

4.pd.concat实现合并

# 进行水平拼接
pd.concat([stock,stock_change],axis=1) # 0为竖直拼接，1为水平拼接

5.pd.merge合并

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                        'key2': ['K0', 'K0', 'K0', 'K0'],
                           'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
left
	key1	key2	A	B
0	K0	K0	A0	B0
1	K0	K1	A1	B1
2	K1	K0	A2	B2
3	K2	K1	A3	B3

right
	key1	key2	C	D
0	K0	K0	C0	D0
1	K1	K0	C1	D1
2	K1	K0	C2	D2
3	K2	K0	C3	D3

pd.merge(left, right, how = 'inner', on = ['key1', 'key2'])
	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

pd.merge(left, right, how = 'left', on = ['key1', 'key2'])
	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

pd.merge(left, right, how = 'right', on = ['key1', 'key2'])
   	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2
3	K2	K0	NaN	NaN	C3	D3

6.分组与聚合

col =pd.DataFrame({'color': ['white','red','green','red','green'], 'object': ['pen','pencil','pencil','ashtray','pen'],'price1':[5.56,4.20,1.30,0.56,2.75],'price2':[4.75,4.12,1.60,0.75,3.15]})
col
	color	object	price1	price2
0	white	pen	5.56	4.75
1	red	pencil	4.20	4.12
2	green	pencil	1.30	1.60
3	red	ashtray	0.56	0.75
4	green	pen	2.75	3.15

# 进行分组，对颜色分组，price1进行聚合
# 用dataframe的方法进行分组
col.groupby(by="color")["price1"].max()
color
green    2.75
red      4.20
white    5.56
Name: price1, dtype: float64

col["price1"].groupby(col["color"]).max()
color
green    2.75
red      4.20
white    5.56
Name: price1, dtype: float64

7.综合案例

1
2
3

# 1.准备数据
movie = pd.read_csv('IMDB-Movie-Data.csv')
movie

# 问题1：我们想知道这些电影数据中评分的平均分，导演的人数等信息，我们应该怎么获取？
# 评分的平均分
movie['Rating'].mean()
6.723199999999999
# 导演的人数
np.unique(movie['Director']).size  # unique是去重操作，size是查看数量
644
# 问题2：对于这一组电影数据，如果我们想知道rating，runtime的分布情况，应该如何呈现数据？
movie['Rating']
0      8.1
1      7.0
2      7.3
3      7.2
4      6.2
      ... 
995    6.2
996    5.5
997    6.2
998    5.6
999    5.3
Name: Rating, Length: 1000, dtype: float64
movie['Rating'].plot(kind='hist',figsize=(20, 8))

# 利用matplotlib来画
import matplotlib.pyplot as plt
# 1.创建画布
plt.figure(figsize=(20, 8),dpi=80)
# 2.绘制直方图
plt.hist(movie['Rating'],20)
# 修改刻度
plt.xticks(np.linspace(movie['Rating'].min(),movie['Rating'].max(),21))
# 添加网格 
plt.grid(linestyle='--', alpha = 0.5)
# 3.显示图像
plt.show()

# 问题3：对于这一组电影数据，如果我们希望统计电影分类(genre)的情况，应该如何处理数据？
# 先统计电影类别都有哪些.
movie_genre = [i.split(',') for i in movie['Genre']]
movie_genre

movie_class = np.unique([j for i in movie_genre for j in i])
movie_class
array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
       'War', 'Western'], dtype='<U9')
# 统计每个类别有几个电影
count = pd.DataFrame(np.zeros(shape = [1000, 20], dtype = 'int32'), columns=movie_class)
count