python
murata not public netkeiba scraping
import pandas as pd
import time
# Progress bar display
from tqdm import tqdm_notebook as tqdm
# Structure of the target URL
# 'https://db.netkeiba.com/race/202006030301'
# ex.
# Breaking down 202006030301:
# 2020 → year (Western calendar)
# 06 → venue ('札幌' Sapporo, '函館' Hakodate, '福島' Fukushima, '新潟' Niigata, '東京' Tokyo, '中山' Nakayama, '中京' Chukyo, '京都' Kyoto, '阪神' Hanshin, '小倉' Kokura)
# 03 → which meeting at that venue
# 03 → which day of that meeting at that venue
# 01 → race number
# Build every candidate 12-character race ID for 2019:
# year (4 digits) + venue + meeting + day + race number (2 digits each).
# The ranges are upper bounds, so some generated IDs may not match a real race.
race_id_list = []
# Venue code: 01-10
for place in range(1, 11):
    # Meeting number at that venue: 01-05
    for kai in range(1, 6):
        # Day within that meeting: 01-08
        for day in range(1, 9):
            # Race number on that day: 01-12
            for r in range(1, 13):
                race_id = f"2019{place:02d}{kai:02d}{day:02d}{r:02d}"
                race_id_list.append(race_id)
# Results that were already fetched are passed as the second argument so that
# an interrupted run can be resumed.
def scrape_race_results(race_id_list, pre_race_results=None):
    """Fetch the first HTML table of each race page into a dict keyed by race ID.

    Args:
        race_id_list: Iterable of race ID strings; each one is appended to
            ``https://db.netkeiba.com/race/`` to form the page URL.
        pre_race_results: Optional dict of results from an earlier run. IDs
            already present in it are skipped. The dict is updated in place
            and returned. When omitted, a fresh dict is created for this call.

    Returns:
        dict mapping race ID to the first table ``pd.read_html`` returned for
        that race's page. If an unexpected error or a manual interrupt stops
        the loop, the results gathered up to that point are returned.
    """
    # Create a new dict per call: a ``{}`` default would be shared between
    # calls, so a later call would silently skip IDs cached by an earlier one.
    race_results = {} if pre_race_results is None else pre_race_results
    # tqdm displays the progress of the loop.
    for race_id in tqdm(race_id_list):
        # Already fetched: report the ID and move on to the next one.
        if race_id in race_results:
            print(race_id)
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            race_results[race_id] = pd.read_html(url)[0]
            # Pause after each successful fetch. Kept inside the try so that
            # an interrupt during the pause still returns what was collected.
            time.sleep(1)
        # Do not stop when there is no race information for this ID.
        except IndexError:
            continue
        # Stop on any other failure or a manual interrupt, but still return
        # race_results so the run can be resumed later.
        except (Exception, KeyboardInterrupt) as exc:
            print(f"scraping stopped at {race_id}: {exc!r}")
            break
    return race_results
# When re-running after an interruption, shift the names on both sides by one
# each time (e.g. test3 = scrape_race_results(race_id_list, test2)).
# `test` must already hold the dict returned by a previous run.
test2 = scrape_race_results(race_id_list, test)
# Join the per-race result tables (currently separate entries of a dict).
# So that rows can still be told apart after joining, the race ID is used as
# the index. scrape_race_results updates and returns the dict it was given,
# so `test` and `test2` refer to the same results here.
# Take the race IDs one at a time,
for key in test:
    # and replace that table's row index with the race ID, repeated once per
    # row of the race's DataFrame (per the original note, one row per horse).
    test[key].index = [key] * len(test[key])
# Concatenate everything into a single DataFrame. sort=False leaves the
# column order unsorted when the tables' columns are not aligned.
results = pd.concat([test[key] for key in test], sort=False)
# Save in pickle format
results.to_pickle('results.pickle')
# Save as CSV (utf_8_sig writes UTF-8 with a byte-order mark)
results.to_csv("results.csv", encoding='utf_8_sig')
Was this helpful?
Similar Posts
- murata not public netkeiba data processing function
- Target database is not up to date - Alembic
- Error - Building wheel for backports.zoneinfo (pyproject.toml) did not run successfully [Solved]
- murata htmlの文字変換(K,M →数字)
- murata フォルダ内のファイルをcsvにまとめる
- murata ファイルを拡張子ごとにフォルダへ振り分け
- murata 途中 要改善 pyautoguiを使ったExcelからテキストファイルへの転記