本文共 9235 字,大约阅读时间需要 30 分钟。
import numpy as npimport pandas as pdimport pandas_datareader.data as webimport datetime# 爬取数据df = web.DataReader("AAPL","yahoo",datetime.datetime(2019,4,1),datetime.date.today())# 展示数据df
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
2019-04-08 | 200.229996 | 196.339996 | 196.419998 | 200.100006 | 25881700 | 200.100006 |
2019-04-09 | 202.850006 | 199.229996 | 200.320007 | 199.500000 | 35768200 | 199.500000 |
2019-04-10 | 200.740005 | 198.179993 | 198.679993 | 200.619995 | 21695300 | 200.619995 |
2019-04-11 | 201.000000 | 198.440002 | 200.850006 | 198.949997 | 20900800 | 198.949997 |
2019-04-12 | 200.139999 | 196.210007 | 199.199997 | 198.869995 | 27744300 | 198.869995 |
df_drop = df.dropna()df_drop.head()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
df_fillna = df.fillna(value=0)df_fillna.head()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
df_isnull = pd.isnull(df)df_isnull.head()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | False | False | False | False | False | False |
2019-04-02 | False | False | False | False | False | False |
2019-04-03 | False | False | False | False | False | False |
2019-04-04 | False | False | False | False | False | False |
2019-04-05 | False | False | False | False | False | False |
# 列计算平均值df.mean()
High 1.981070e+02Low 1.950050e+02Open 1.962690e+02Close 1.971340e+02Volume 2.435307e+07Adj Close 1.971340e+02dtype: float64
# 行计算平均值df.mean(1)
Date2019-04-01 4.643826e+062019-04-02 3.794444e+062019-04-03 3.878796e+062019-04-04 3.185879e+062019-04-05 3.087931e+062019-04-08 4.313782e+062019-04-09 5.961534e+062019-04-10 3.616050e+062019-04-11 3.483633e+062019-04-12 4.624216e+06dtype: float64
#skipna参数默认是True 表示排除缺失值df.mean(axis = 1,skipna = False)
Date2019-04-01 4.643826e+062019-04-02 3.794444e+062019-04-03 3.878796e+062019-04-04 3.185879e+062019-04-05 3.087931e+062019-04-08 4.313782e+062019-04-09 5.961534e+062019-04-10 3.616050e+062019-04-11 3.483633e+062019-04-12 4.624216e+06dtype: float64
# 行名字排序sorted_row_df = df.sort_index()sorted_row_df.head()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
# 列名字排序sorted_col_df = df.sort_index(axis=1)sorted_col_df.head()
Adj Close | Close | High | Low | Open | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.240005 | 191.240005 | 191.679993 | 188.380005 | 191.639999 | 27862000 |
2019-04-02 | 194.020004 | 194.020004 | 194.460007 | 191.050003 | 191.089996 | 22765700 |
2019-04-03 | 195.350006 | 195.350006 | 196.500000 | 193.149994 | 193.250000 | 23271800 |
2019-04-04 | 195.690002 | 195.690002 | 196.369995 | 193.139999 | 194.789993 | 19114300 |
2019-04-05 | 197.000000 | 197.000000 | 197.100006 | 195.929993 | 196.449997 | 18526600 |
常用的方法如上所介绍,还有很多其他的,下面罗列了一些,可供参考:
count 非na值的数量
describe 针对Series或DataFrame列计算汇总统计
min、max 计算最小值和最大值
argmin、argmax 计算能够获取到最大值和最小值得索引位置
idxmin、idxmax 计算能够获取到最大值和最小值得索引值
quantile 计算样本的分位数(0到1)
sum 值的总和
mean 值的平均数
median 值的算术中位数
mad 根据平均值计算平均绝对离差
var 样本值的方差
std 样本值的标准差
skew 样本值的偏度(三阶矩)
kurt 样本值的累积和
cumsum 样本值的累积和
cummin,cummax 样本值的累计最大值和累计最小值
cumprod 样本值的累计积
diff 计算一阶差分
pct_change 计算百分数变化
Pandas提供了大量的方法能够轻松的对Series,DataFrame和Pannel对象进行各种符号各种逻辑关系的合并操作
concat 可以沿一条轴将多个对象堆叠到一起
append 将一行连接到一个DataFrame上
duolicated 移除重复数据
df1 = web.DataReader("AAPL","yahoo",datetime.datetime(2019,4,1),datetime.datetime(2019,4,3))df1
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
df2 = web.DataReader("AAPL","yahoo",datetime.datetime(2019,4,5),datetime.datetime(2019,4,8))df2
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
2019-04-08 | 200.229996 | 196.339996 | 196.419998 | 200.100006 | 25881700 | 200.100006 |
纵向拼接(默认):
pd.concat([df1,df2],axis=0)
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
2019-04-04 | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300 | 195.690002 |
2019-04-05 | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600 | 197.000000 |
2019-04-08 | 200.229996 | 196.339996 | 196.419998 | 200.100006 | 25881700 | 200.100006 |
横向拼接,index对不上的用NaN填充
pd.concat([df1,df2],axis=1)
High | Low | Open | Close | Volume | Adj Close | High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 | NaN | NaN | NaN | NaN | NaN | NaN |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700.0 | 194.020004 | NaN | NaN | NaN | NaN | NaN | NaN |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800.0 | 195.350006 | NaN | NaN | NaN | NaN | NaN | NaN |
2019-04-04 | NaN | NaN | NaN | NaN | NaN | NaN | 196.369995 | 193.139999 | 194.789993 | 195.690002 | 19114300.0 | 195.690002 |
2019-04-05 | NaN | NaN | NaN | NaN | NaN | NaN | 197.100006 | 195.929993 | 196.449997 | 197.000000 | 18526600.0 | 197.000000 |
2019-04-08 | NaN | NaN | NaN | NaN | NaN | NaN | 200.229996 | 196.339996 | 196.419998 | 200.100006 | 25881700.0 | 200.100006 |
df1
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800 | 195.350006 |
s = df1.iloc[0]s
High 1.916800e+02Low 1.883800e+02Open 1.916400e+02Close 1.912400e+02Volume 2.786200e+07Adj Close 1.912400e+02Name: 2019-04-01 00:00:00, dtype: float64
# ignore_index=False 表示索引不变df1.append(s,ignore_index=False)
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700.0 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800.0 | 195.350006 |
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
# ignore_index=True 表示索引重置df1.append(s,ignore_index=True)
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
0 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
1 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700.0 | 194.020004 |
2 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800.0 | 195.350006 |
3 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
z = df1.append(s,ignore_index=False)z
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700.0 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800.0 | 195.350006 |
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
查看重复数据
z.duplicated()
Date2019-04-01 False2019-04-02 False2019-04-03 False2019-04-01 Truedtype: bool
移除重复数据
z.drop_duplicates()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2019-04-01 | 191.679993 | 188.380005 | 191.639999 | 191.240005 | 27862000.0 | 191.240005 |
2019-04-02 | 194.460007 | 191.050003 | 191.089996 | 194.020004 | 22765700.0 | 194.020004 |
2019-04-03 | 196.500000 | 193.149994 | 193.250000 | 195.350006 | 23271800.0 | 195.350006 |
z.groupby("Open").sum()
High | Low | Close | Volume | Adj Close | |
---|---|---|---|---|---|
Open | |||||
191.089996 | 194.460007 | 191.050003 | 194.020004 | 22765700.0 | 194.020004 |
191.639999 | 383.359985 | 376.760010 | 382.480011 | 55724000.0 | 382.480011 |
193.250000 | 196.500000 | 193.149994 | 195.350006 | 23271800.0 | 195.350006 |
z.groupby(["Open","Close"]).sum()
High | Low | Volume | Adj Close | ||
---|---|---|---|---|---|
Open | Close | ||||
191.089996 | 194.020004 | 194.460007 | 191.050003 | 22765700.0 | 194.020004 |
191.639999 | 191.240005 | 383.359985 | 376.760010 | 55724000.0 | 382.480011 |
193.250000 | 195.350006 | 196.500000 | 193.149994 | 23271800.0 | 195.350006 |
转载地址:http://jdzdf.baihongyu.com/