我正在尝试使用 Python 从维基百科抓取数据。代码的目的是访问 S&P 500 成分公司的表格,并把每家公司的数据各提取到一个 CSV 文件中。其中一部分数据已经成功获取,但我遇到了一个套接字相关的异常,感觉有点难以理解。下面是我的完整代码:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
def save_sp500_tickers():
    """Scrape the S&P 500 constituent table from Wikipedia and pickle the tickers.

    Returns:
        list of ticker symbol strings, one per company. The list is also
        written to ``sp500tickers.pickle`` as a side effect.
    """
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        # .text keeps the table cell's trailing newline ("MMM\n"); strip it so
        # the symbol is usable both as a filename and as a Yahoo query symbol.
        ticker = row.findAll('td')[0].text.strip()
        tickers.append(ticker)
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers
#save_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
    """Download daily Yahoo time series for each S&P 500 ticker into stock_dfs/.

    Args:
        reload_sp500: when True, re-scrape the ticker list from Wikipedia;
            otherwise reuse the pickled list (scraping it first if the pickle
            does not exist yet — the original crashed with FileNotFoundError
            in that case).
    """
    if reload_sp500 or not os.path.exists("sp500tickers.pickle"):
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2016, 12, 31)
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            try:
                df = web.DataReader(ticker, "yahoo", start, end)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            except Exception as e:
                # Yahoo data is not guaranteed to exist for every symbol in this
                # date range; skip the ticker instead of aborting the whole run
                # (a single RemoteDataError previously killed the loop).
                print('Could not fetch {}: {}'.format(ticker, e))
        else:
            print('Already have {}'.format(ticker))
get_data_from_yahoo()
我得到如下异常
Traceback (most recent call last):
File "C:\Users\Jeet Chatterjee\Data Analysis With Python for finance\op6.py", line 49, in <module>
get_data_from_yahoo()
File "C:\Users\Jeet Chatterjee\Data Analysis With Python for finance\op6.py", line 44, in get_data_from_yahoo
df = web.DataReader(ticker, "yahoo", start, end)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\data.py", line 121, in DataReader
session=session).read()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\yahoo\daily.py", line 115, in read
df = super(YahooDailyReader, self).read()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\base.py", line 181, in read
params=self._get_params(self.symbols))
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\base.py", line 79, in _read_one_data
out = self._read_url_as_StringIO(url, params=params)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\base.py", line 90, in _read_url_as_StringIO
response = self._get_response(url, params=params)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\pandas_datareader\base.py", line 139, in _get_response
raise RemoteDataError('Unable to read URL: {0}'.format(url))
pandas_datareader._utils.RemoteDataError: Unable to read URL: https://query1.finance.yahoo.com/v7/finance/download/AGN?period1=946665000&period2=1483208999&interval=1d&events=history&crumb=6JtBOAj%5Cu002F6EP
请帮我解决这个问题,提前致谢
您的代码本身没有太大问题。一个原因是 Yahoo 的时间序列数据并不保证 100% 可用,它确实会时有时无。我刚查看了雅虎网站:虽然对您报错的 Allergan (AGN) 现在看起来没有问题,但我尝试时 Brown Forman (BF.B) 和 Berkshire Hathaway B (BRK.B) 是不可用的。
另一个问题是,您不能假设标准普尔 500 指数上的每个交易品种都有您硬编码的范围内的时间序列数据;有些只存在于 2017 年。
以下是代码的略微修改版本,它尽最大努力获取所有符号,请求从 2000 年 1 月 1 日到当天的数据,如果雅虎没有可用数据,则放弃。
在撰写本文时,它能够获取标准普尔 500 指数当前 505 个品种中的 503 个的时间序列。注意我使用了代理服务器,您可以删除或注释掉这部分代码。
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
# proxy servers for internet connection
# NOTE(review): placeholder host — replace with your real proxy, or remove the
# proxy usages entirely if you are on a direct connection.
proxies = {
    'http': 'http://my.proxy.server:8080',
    'https': 'https://my.proxy.server:8080',
}
# pickle file caching the scraped S&P 500 ticker list between runs
symbol_filename = "sp500tickers.pickle"
def save_sp500_tickers():
    """Scrape the S&P 500 constituent table from Wikipedia and pickle the tickers.

    Returns:
        list of ticker symbol strings in Yahoo Finance notation. The list is
        also written to ``symbol_filename`` as a side effect.
    """
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', proxies=proxies)
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        # .text includes the cell's trailing newline — strip it. Yahoo also uses
        # '-' where Wikipedia uses '.' (e.g. BRK.B -> BRK-B, BF.B -> BF-B); that
        # mismatch is exactly why those two symbols fail to download.
        ticker = row.findAll('td')[0].text.strip().replace('.', '-')
        tickers.append(ticker)
    with open(symbol_filename, "wb") as f:
        pickle.dump(tickers, f)
    return tickers
def get_data_from_yahoo(reload_sp500=False):
    """Best-effort download of daily Yahoo data for every S&P 500 ticker.

    Writes one CSV per ticker into stock_dfs/, skipping tickers already on
    disk and tickers for which Yahoo has no time series.

    Args:
        reload_sp500: when True, re-scrape the ticker list from Wikipedia;
            otherwise reuse the pickled list if it exists.
    """
    if reload_sp500 or not os.path.exists(symbol_filename):
        tickers = save_sp500_tickers()
    else:
        with open(symbol_filename, "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(2000, 1, 1)
    # Request through "today"; symbols listed after 2000 simply return a
    # shorter series rather than failing.
    end = dt.datetime(dt.date.today().year, dt.date.today().month, dt.date.today().day)
    for ticker in tickers:
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            try:
                # print is a function in Python 3 — the original `print ticker`
                # is a SyntaxError on the asker's Python 3.6.
                print(ticker)
                df = web.DataReader(ticker, "yahoo", start, end)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            except Exception:
                # Narrowed from a bare `except:` (which would also swallow
                # KeyboardInterrupt); Yahoo data comes and goes, so skip and
                # keep going.
                print("No timeseries available for " + ticker)
        else:
            pass  # print('Already have {}'.format(ticker))

# Route pandas_datareader's HTTP calls through the proxy as well.
os.environ["HTTP_PROXY"] = proxies['http']
os.environ["HTTPS_PROXY"] = proxies['https']
get_data_from_yahoo()
希望这是有帮助的。
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句