Machine Learning for Trading--1-4 Statistical Analysis and Stock Basics

本篇将介绍股市中的一些基本概念以及如何使用pandas和matplotlib库来对股票信息进行统计分析。

  • 对全局数据进行统计分析–Compute Global Statistics
  • 计算滚动的统计数据–Computing Rolling Statistics
  • 布林带–Bollinger Bands
  • 计算单日回报率-Compute daily returns

对全局数据进行统计分析–Compute Global Statistics

  • 将DataFrame中的所有列数据绘制在同一张图中 – df.plot()
  • 计算所有列的平均数,中位数和标准差 – df.mean(), df.median(), df.std()

Example:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import pandas as pd
import matplotlib.pyplot as plt

def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file for a ticker symbol.

    The file name is the string form of ``symbol`` followed by ``.csv``,
    joined onto ``base_dir``.
    """
    filename = "{}.csv".format(str(symbol))
    return os.path.join(base_dir, filename)

def get_data(symbols, dates):
    """Read adjusted-close prices for the given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. 'SPY' is placed first as a
            reference column when it is absent. The caller's sequence is
            never modified.
        dates: Index for the resulting frame (e.g. from ``pd.date_range``).

    Returns:
        A DataFrame indexed by ``dates`` with one column per symbol, holding
        each file's 'Adj Close' values. Rows where SPY has no value are
        dropped.
    """
    # Work on a private copy: inserting into the caller's list would
    # silently change it, and would fail outright for a tuple.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(
            symbol_to_path(symbol),
            index_col='Date',
            parse_dates=True,
            usecols=['Date', 'Adj Close'],
            na_values=['nan'],
        )
        # Name the price column after the ticker so joined columns stay distinct.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])
    return df

def plot_data(df, title="Stock prices"):
    """Draw the columns of ``df`` on one chart and display it.

    The chart uses ``title`` as its heading, ``fontsize=12``, and the axis
    labels "Date" (x) and "Price" (y).
    """
    axes = df.plot(title=title, fontsize=12)
    axes.set_ylabel("Price")
    axes.set_xlabel("Date")
    plt.show()

def test_run():
    """Load 2010-2012 prices for four symbols, plot them, print statistics."""
    symbols = ['SPY', 'XOM', 'GOOG', 'GLD']
    dates = pd.date_range('2010-01-01', '2012-12-31')
    df = get_data(symbols, dates)
    plot_data(df)

    # Global statistics per column: mean, median, then standard deviation.
    for statistic in (df.mean, df.median, df.std):
        print(statistic())


if __name__ == "__main__":
    test_run()

计算滚动的统计数据–Computing Rolling Statistics

  • 以20天的window计算滚动平均值 – rm_SPY = df['SPY'].rolling(window=20).mean()
  • 将rolling mean添加到现有数据的图中 – rm_SPY.plot()

Example:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import pandas as pd
import matplotlib.pyplot as plt

def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file for a ticker symbol.

    The file name is the string form of ``symbol`` followed by ``.csv``,
    joined onto ``base_dir``.
    """
    filename = "{}.csv".format(str(symbol))
    return os.path.join(base_dir, filename)

def get_data(symbols, dates):
    """Read adjusted-close prices for the given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. 'SPY' is placed first as a
            reference column when it is absent. The caller's sequence is
            never modified.
        dates: Index for the resulting frame (e.g. from ``pd.date_range``).

    Returns:
        A DataFrame indexed by ``dates`` with one column per symbol, holding
        each file's 'Adj Close' values. Rows where SPY has no value are
        dropped.
    """
    # Work on a private copy: inserting into the caller's list would
    # silently change it, and would fail outright for a tuple.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(
            symbol_to_path(symbol),
            index_col='Date',
            parse_dates=True,
            usecols=['Date', 'Adj Close'],
            na_values=['nan'],
        )
        # Name the price column after the ticker so joined columns stay distinct.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])
    return df

def test_run():
    """Plot 2010-2012 SPY prices together with their rolling mean.

    The rolling mean uses a window of 20 rows and is drawn on the same axes
    as the raw SPY series.
    """
    df = get_data(['SPY', 'XOM', 'GOOG', 'GLD'],
                  pd.date_range('2010-01-01', '2012-12-31'))
    spy = df['SPY']

    # Draw the raw series first and keep its axes so the overlay shares them.
    ax = spy.plot(title="SPY rolling mean", label='SPY')
    spy.rolling(window=20).mean().plot(label='Rolling mean', ax=ax)

    # Label the axes and show the legend before displaying the figure.
    ax.set_ylabel("Price")
    ax.set_xlabel("Date")
    ax.legend(loc='upper left')
    plt.show()


if __name__ == "__main__":
    test_run()

布林带–Bollinger Bands

  • 布林带 – 其基本的型态是由三条轨道线组成的带状通道(中轨和上、下轨各一条)。“中轨”为股价的平均成本,“上轨”和“下轨”可分别视为股价的压力线和支撑线。
  • 布林带中轨 – 滚动平均值 rolling mean
  • 布林带上轨和下轨 – 滚动平均值 +/- 二倍滚动标准差 rolling mean +/- rstd * 2

Example:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import pandas as pd
import matplotlib.pyplot as plt

def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file for a ticker symbol.

    The file name is the string form of ``symbol`` followed by ``.csv``,
    joined onto ``base_dir``.
    """
    filename = "{}.csv".format(str(symbol))
    return os.path.join(base_dir, filename)

def get_data(symbols, dates):
    """Read adjusted-close prices for the given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. 'SPY' is placed first as a
            reference column when it is absent. The caller's sequence is
            never modified.
        dates: Index for the resulting frame (e.g. from ``pd.date_range``).

    Returns:
        A DataFrame indexed by ``dates`` with one column per symbol, holding
        each file's 'Adj Close' values. Rows where SPY has no value are
        dropped.
    """
    # Work on a private copy: inserting into the caller's list would
    # silently change it, and would fail outright for a tuple.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(
            symbol_to_path(symbol),
            index_col='Date',
            parse_dates=True,
            usecols=['Date', 'Adj Close'],
            na_values=['nan'],
        )
        # Name the price column after the ticker so joined columns stay distinct.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])

    return df

def plot_data(df, title="Stock prices"):
    """Draw the columns of ``df`` on one chart and display it.

    The chart uses ``title`` as its heading, ``fontsize=12``, and the axis
    labels "Date" (x) and "Price" (y).
    """
    axes = df.plot(title=title, fontsize=12)
    axes.set_ylabel("Price")
    axes.set_xlabel("Date")
    plt.show()

def get_rolling_mean(values, windows):
    """Compute the rolling mean of ``values`` over a window of ``windows`` rows."""
    rolling_view = values.rolling(window=windows)
    return rolling_view.mean()

def get_rolling_std(values, windows):
    """Compute the rolling standard deviation of ``values``.

    The deviation is taken over a window of ``windows`` rows.
    """
    rolling_view = values.rolling(window=windows)
    return rolling_view.std()

def get_bollinger_bands(rm, rstd, num_std=2):
    """Return the upper and lower Bollinger Bands.

    Args:
        rm: Rolling mean (the middle band).
        rstd: Rolling standard deviation, aligned with ``rm``.
        num_std: Band half-width as a multiple of ``rstd``. Defaults to 2,
            the value this function previously hard-coded.

    Returns:
        Tuple ``(upper_band, lower_band)`` equal to
        ``rm + num_std * rstd`` and ``rm - num_std * rstd``.
    """
    band_offset = rstd * num_std
    upper_band = rm + band_offset
    lower_band = rm - band_offset
    return upper_band, lower_band

def test_run():
    """Plot 2012 SPY prices with their rolling mean and Bollinger Bands.

    The rolling mean and rolling standard deviation both use a window of
    20 rows. All four series are drawn on the same axes.
    """
    df = get_data(['SPY'], pd.date_range('2012-01-01', '2012-12-31'))
    spy = df['SPY']

    # Middle band, spread, then the two outer bands derived from them.
    rm_SPY = get_rolling_mean(spy, windows=20)
    rstd_SPY = get_rolling_std(spy, windows=20)
    upper_band, lower_band = get_bollinger_bands(rm_SPY, rstd_SPY)

    # Raw prices go first so the overlays can reuse their axes.
    ax = spy.plot(title="Bollinger Bands", label='SPY')
    overlays = (
        (rm_SPY, 'Rolling mean'),
        (upper_band, 'upper band'),
        (lower_band, 'lower band'),
    )
    for series, label in overlays:
        series.plot(label=label, ax=ax)

    # Label the axes and show the legend before displaying the figure.
    ax.set_ylabel("Price")
    ax.set_xlabel("Date")
    ax.legend(loc='upper left')
    plt.show()


if __name__ == "__main__":
    test_run()

计算单日回报率-Compute daily returns

  • 回报率计算公式 – daily_ret[t] = (price[t]/price[t-1]) - 1
  • Pandas有两种方式计算回报率:
    • daily_returns[1:] = (df[1:] / df[:-1].values) - 1
    • daily_returns = (df / df.shift(1)) - 1

Example:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import pandas as pd
import matplotlib.pyplot as plt

def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file for a ticker symbol.

    The file name is the string form of ``symbol`` followed by ``.csv``,
    joined onto ``base_dir``.
    """
    filename = "{}.csv".format(str(symbol))
    return os.path.join(base_dir, filename)

def get_data(symbols, dates):
    """Read adjusted-close prices for the given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. 'SPY' is placed first as a
            reference column when it is absent. The caller's sequence is
            never modified.
        dates: Index for the resulting frame (e.g. from ``pd.date_range``).

    Returns:
        A DataFrame indexed by ``dates`` with one column per symbol, holding
        each file's 'Adj Close' values. Rows where SPY has no value are
        dropped.
    """
    # Work on a private copy: inserting into the caller's list would
    # silently change it, and would fail outright for a tuple.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(
            symbol_to_path(symbol),
            index_col='Date',
            parse_dates=True,
            usecols=['Date', 'Adj Close'],
            na_values=['nan'],
        )
        # Name the price column after the ticker so joined columns stay distinct.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])

    return df

def plot_data(df, title="Stock prices", xlabel="Date", ylabel="Price"):
    """Draw the columns of ``df`` on one chart and display it.

    The chart uses ``title`` as its heading, ``fontsize=12``, and
    ``xlabel`` / ``ylabel`` as the axis labels.
    """
    axes = df.plot(title=title, fontsize=12)
    axes.set_ylabel(ylabel)
    axes.set_xlabel(xlabel)
    plt.show()

def compute_daily_returns(df):
    """Compute daily returns: ``price[t] / price[t-1] - 1``.

    Args:
        df: DataFrame (or Series) of prices, one row per period.

    Returns:
        A new object with the same shape as ``df``. The first row has no
        predecessor and is set to 0 instead of NaN. ``df`` itself is not
        modified, and empty input is returned as an empty result.
    """
    # Dividing by the one-row-shifted frame pairs each price with the
    # previous row's price; the arithmetic builds a new object.
    daily_returns = (df / df.shift(1)) - 1
    if len(daily_returns) > 0:
        # The shift leaves the first row as NaN; report it as a zero return.
        daily_returns.iloc[0] = 0
    return daily_returns

def test_run():
    """Plot July 2012 prices for SPY and XOM, then their daily returns."""
    month = pd.date_range('2012-07-01', '2012-07-31')  # one month only
    df = get_data(['SPY', 'XOM'], month)
    plot_data(df)

    # Second chart: the day-over-day returns of the same frame.
    returns = compute_daily_returns(df)
    plot_data(returns, title="Daily returns", ylabel="Daily returns")


if __name__ == "__main__":
    test_run()
您的支持将鼓励我继续创作