如何用Python分析世界杯比赛数据:从爆冷事件看数据分析实战
简介
在刚刚结束的2026美加墨世界杯小组赛中,一个惊人的冷门震惊了世界:世界排名第67位的佛得角队以0比0逼平了强大的夺冠热门西班牙队。这场比赛西班牙队控球率高达74%,射门次数18比3,但就是无法攻破对手的球门。作为技术爱好者,我们不禁思考:能否通过数据分析来理解这场比赛?更重要的是,我们能否建立一套分析工具,提前评估类似“爆冷”的可能性?
本文将带你实战构建一个世界杯数据分析项目。我们将使用Python获取、处理和分析足球比赛数据,并通过可视化来直观展示比赛数据。即使你是初中级开发者,也能跟着教程一步步完成。如果你需要一台性能不错的笔记本电脑来运行数据脚本,可以考虑高性能的型号。
前置准备
在开始之前,请确保你具备以下条件:
- Python环境:安装Python 3.8或更高版本
- 基础知识:基本的Python语法、数据处理概念
- 开发工具:推荐使用VS Code或PyCharm,配合好用的机械键盘会让编码更高效
- 所需库:我们将使用以下Python库(步骤4的预测模型还需要scikit-learn):
```bash
pip install pandas numpy matplotlib seaborn requests beautifulsoup4 scikit-learn
```
- 数据源:我们将使用公开的足球比赛数据API或准备好的示例数据集
步骤1:获取和准备数据
首先,我们需要获取比赛数据。我们可以从公开的足球API获取数据,或者使用手动整理的数据集。
# data_loader.py
import pandas as pd
import requests
import json
from datetime import datetime
class WorldCupDataLoader:
    """Provide World Cup match data and team rankings for the tutorial.

    Both getters return hard-coded sample data. ``api_key`` and ``base_url``
    are stored on the instance but no HTTP request is made by this class.
    """

    def __init__(self, api_key=None):
        """Store the optional API key and the base URL of the data API."""
        self.api_key = api_key
        self.base_url = "https://api.football-data.org/v4"

    def get_match_data(self, match_id):
        """Return detailed data for a single match.

        Args:
            match_id: Identifier that is echoed back in the ``match_id`` field.

        Returns:
            A dict with the teams, score, date and a ``stats`` mapping of
            metric name -> ``{"home": value, "away": value}``.
        """
        # Mock data is used here; a real application would fetch it from the API.
        return {
            "match_id": match_id,
            "home_team": "Spain",
            "away_team": "Cape Verde",
            "score": "0-0",
            "date": "2026-06-16",
            "stats": {
                "possession": {"home": 74, "away": 26},
                "shots": {"home": 18, "away": 3},
                "shots_on_target": {"home": 7, "away": 1},
                "corners": {"home": 12, "away": 2},
                "fouls": {"home": 9, "away": 14},
                "passes": {"home": 812, "away": 245},
                "pass_accuracy": {"home": 91, "away": 78},
            },
        }

    def get_team_rankings(self):
        """Return a mapping of team name -> world ranking (sample subset)."""
        return {
            # Spain is ranked 8th, the same value the prediction example
            # passes as team1_rank. Reusing Cape Verde's 67 here would make
            # the rank difference between the two sides zero.
            "Spain": 8,
            "Cape Verde": 67,
            "Brazil": 1,
            "France": 2,
            "England": 3,
            # ... more teams
        }
# Usage example: load the sample Spain vs Cape Verde match and print a summary.
# `match_data` is kept at module level because the later steps reuse it.
loader = WorldCupDataLoader()
match_data = loader.get_match_data("ESP-CPV-20260616")
print(f"比赛: {match_data['home_team']} vs {match_data['away_team']}")
print(f"比分: {match_data['score']}")
步骤2:数据清洗与整理
获取数据后,我们需要将其转换为适合分析的格式。Pandas是Python中处理表格数据的利器。
# data_processor.py
import pandas as pd
import numpy as np
from datetime import datetime
class MatchDataProcessor:
    """Turn a raw match dict into a stats table and per-team scores."""

    # Relative importance of each metric in the performance score.
    # Metrics that are not listed here (e.g. fouls) do not affect the score.
    _WEIGHTS = {
        'possession': 0.15,
        'shots': 0.20,
        'shots_on_target': 0.25,
        'corners': 0.10,
        'passes': 0.10,
        'pass_accuracy': 0.20,
    }

    def __init__(self, match_data):
        """Keep the raw match dict; the stats table is built lazily."""
        self.match_data = match_data
        self.processed_data = None

    def process_match_stats(self):
        """Convert ``match_data['stats']`` into a DataFrame.

        Returns:
            A DataFrame with one row per metric and the columns ``metric``,
            ``home``, ``away``, ``difference`` (home - away), ``home_pct``
            and ``away_pct``. The result is also cached on
            ``self.processed_data``.
        """
        stats = self.match_data['stats']

        metrics = list(stats)
        # A side missing from a metric is treated as 0.
        home_values = [values.get('home', 0) for values in stats.values()]
        away_values = [values.get('away', 0) for values in stats.values()]

        df = pd.DataFrame({
            'metric': metrics,
            'home': home_values,
            'away': away_values,
            'difference': [h - a for h, a in zip(home_values, away_values)],
        })

        # Share of each metric's total held by each side. When both sides
        # are 0 the division yields NaN rather than raising.
        df['home_pct'] = df['home'] / (df['home'] + df['away']) * 100
        df['away_pct'] = 100 - df['home_pct']

        self.processed_data = df
        return df

    def calculate_performance_score(self, team):
        """Compute a weighted 0-100 performance score for one side.

        Each weighted metric is normalized against the larger of the two
        sides' values, so the side that leads a metric gets 100 for it.

        Args:
            team: ``'home'`` or ``'away'``.

        Returns:
            The weighted average rounded to two decimals, or 50.0 when no
            weighted metric has a positive value to normalize against.

        Raises:
            ValueError: If ``team`` is neither ``'home'`` nor ``'away'``.
        """
        if team not in ('home', 'away'):
            raise ValueError(f"team must be 'home' or 'away', got {team!r}")

        if self.processed_data is None:
            self.process_match_stats()
        df = self.processed_data

        score = 0.0
        total_weight = 0.0
        for metric, weight in self._WEIGHTS.items():
            matching = df[df['metric'] == metric]
            if matching.empty:
                continue
            row = matching.iloc[0]

            max_val = max(row['home'], row['away'])
            if max_val <= 0:
                # Nothing to normalize against; leave this metric out.
                continue

            # Read the requested side's column so 'away' is scored as well.
            score += (row[team] / max_val) * 100 * weight
            total_weight += weight

        final_score = score / total_weight if total_weight > 0 else 50.0
        return round(float(final_score), 2)
# Usage example: build the stats table for the match loaded earlier and
# score both sides. `stats_df` is reused by the visualization step.
processor = MatchDataProcessor(match_data)
stats_df = processor.process_match_stats()
print("比赛统计:")
print(stats_df)
# In match_data the home side is Spain and the away side is Cape Verde.
spain_score = processor.calculate_performance_score('home')
cape_verde_score = processor.calculate_performance_score('away')
print(f"\n西班牙表现评分: {spain_score}/100")
print(f"佛得角表现评分: {cape_verde_score}/100")
步骤3:数据分析与可视化
现在让我们通过可视化来直观理解这场比赛的数据。如果你打算深入学习数据可视化,一本好的Python数据科学手册会是不错的参考资料。
# visualizer.py
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
class MatchVisualizer:
    """Draw comparison charts from a processed match-stats DataFrame.

    Every chart method saves a PNG into the current working directory,
    shows the figure and then closes it.

    NOTE: titles and labels contain Chinese text; matplotlib needs a font
    with CJK glyphs configured (e.g. via ``plt.rcParams['font.sans-serif']``)
    for them to render correctly.
    """

    def __init__(self, stats_df):
        """Keep the stats table and apply the chart style.

        Args:
            stats_df: DataFrame with the columns ``metric``, ``home``,
                ``away``, ``home_pct`` and ``away_pct``.
        """
        self.stats_df = stats_df
        try:
            plt.style.use('seaborn-v0_8-whitegrid')
        except OSError:
            # matplotlib < 3.6 ships this style under its older name.
            plt.style.use('seaborn-whitegrid')

    def create_comparison_radar(self, team1, team2):
        """Draw a radar chart of both sides' percentage share per metric.

        Args:
            team1: Legend label for the home side (``home_pct`` values).
            team2: Legend label for the away side (``away_pct`` values).

        Saves the chart as ``match_radar.png``.
        """
        key_metrics = ['possession', 'shots', 'shots_on_target',
                       'corners', 'passes', 'pass_accuracy']
        filtered_df = self.stats_df[self.stats_df['metric'].isin(key_metrics)]

        categories = filtered_df['metric'].tolist()
        team1_values = filtered_df['home_pct'].tolist()
        team2_values = filtered_df['away_pct'].tolist()

        # One spoke per metric, evenly spaced around the circle.
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()

        # Repeat the first point at the end so each outline closes.
        team1_values += team1_values[:1]
        team2_values += team2_values[:1]
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
        ax.plot(angles, team1_values, 'o-', linewidth=2, label=team1, color='#e63946')
        ax.fill(angles, team1_values, alpha=0.1, color='#e63946')
        ax.plot(angles, team2_values, 'o-', linewidth=2, label=team2, color='#457b9d')
        ax.fill(angles, team2_values, alpha=0.1, color='#457b9d')

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)
        ax.set_ylim(0, 100)
        ax.set_title(f'{team1} vs {team2} 比赛数据对比', size=16, y=1.1)
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))

        plt.tight_layout()
        plt.savefig('match_radar.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

    def create_bar_comparison(self, home_label='Spain', away_label='Cape Verde',
                              title='西班牙 vs 佛得角:关键数据对比'):
        """Draw one bar chart per metric for the first six rows of the table.

        Args:
            home_label: Tick label for the home-side bar.
            away_label: Tick label for the away-side bar.
            title: Figure title.

        The defaults reproduce the Spain vs Cape Verde example. Saves the
        chart as ``match_bars.png``.
        """
        key_stats = self.stats_df.head(6)  # first six rows of the stats table

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()
        colors = ['#e63946', '#457b9d']

        for idx, (_, row) in enumerate(key_stats.iterrows()):
            ax = axes[idx]
            values = [row['home'], row['away']]
            ax.bar([home_label, away_label], values, color=colors)
            ax.set_title(row['metric'].replace('_', ' ').title())
            ax.set_ylabel('数值')

            # Print each value just above its bar.
            for i, v in enumerate(values):
                ax.text(i, v + 0.5, str(v), ha='center', va='bottom', fontweight='bold')

        # Hide grid cells that have no metric to show.
        for ax in axes[len(key_stats):]:
            ax.set_visible(False)

        plt.suptitle(title, fontsize=16, y=1.02)
        plt.tight_layout()
        plt.savefig('match_bars.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)

    def create_shot_chart(self, shots_data):
        """Draw a conceptual shot-location chart on a 100x100 pitch outline.

        Args:
            shots_data: Currently unused; the shot positions are randomly
                generated sample data (18 for Spain, 3 for Cape Verde).

        Saves the chart as ``shot_chart.png``.
        """
        fig, ax = plt.subplots(figsize=(12, 5))

        # Pitch outline: bottom, top, left and right edges.
        ax.plot([0, 100], [0, 0], 'k-', linewidth=2)
        ax.plot([0, 100], [100, 100], 'k-', linewidth=2)
        ax.plot([0, 0], [0, 100], 'k-', linewidth=2)
        ax.plot([100, 100], [0, 100], 'k-', linewidth=2)
        # Halfway line.
        ax.plot([50, 50], [0, 100], 'k--', linewidth=1, alpha=0.3)

        # A private generator seeded with 42 draws the same sample points as
        # seeding the global generator, without resetting global RNG state.
        rng = np.random.RandomState(42)

        # Spain's shots (red).
        spain_x = rng.uniform(60, 95, 18)
        spain_y = rng.uniform(20, 80, 18)
        ax.scatter(spain_x, spain_y, c='#e63946', s=100, alpha=0.7, label='西班牙射门')

        # Cape Verde's shots (blue).
        cape_x = rng.uniform(5, 40, 3)
        cape_y = rng.uniform(30, 70, 3)
        ax.scatter(cape_x, cape_y, c='#457b9d', s=100, alpha=0.7, label='佛得角射门')

        ax.set_xlim(-5, 105)
        ax.set_ylim(-5, 105)
        ax.set_aspect('equal')
        ax.legend()
        ax.set_title('射门位置分布示意图', fontsize=14)
        ax.set_xlabel('场地长度方向')
        ax.set_ylabel('场地宽度方向')

        plt.tight_layout()
        plt.savefig('shot_chart.png', dpi=300)
        plt.show()
        plt.close(fig)
# Usage example: draw the radar and bar charts for the stats table built in
# the previous step. The calls save match_radar.png and match_bars.png.
visualizer = MatchVisualizer(stats_df)
visualizer.create_comparison_radar('西班牙', '佛得角')
visualizer.create_bar_comparison()
步骤4:构建简单的爆冷预测模型
基于历史数据,我们可以尝试构建一个简单的模型来预测比赛的可能结果。
```python
# predictor.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
class UpsetPredictor:
    """Toy random-forest model that estimates the probability of an upset."""

    def __init__(self):
        """Create the untrained classifier and the list of feature columns."""
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.features = ['rank_difference', 'home_advantage', 'historical_win_rate']

    def prepare_training_data(self):
        """Return the training set as a DataFrame (hard-coded sample data).

        A real application should use a large set of historical matches.
        """
        data = {
            'rank_difference': [60, 20, 10, 50, 15, 5, 40, 25, 8, 30],
            'home_advantage': [0, 1, 1, 0, 1, 1, 0, 1, 1, 0],  # 1 = home match
            'historical_win_rate': [0.9, 0.7, 0.6, 0.8, 0.65, 0.55, 0.75, 0.7, 0.6, 0.8],
            'is_upset': [0, 0, 0, 1, 0, 0, 1, 0, 0, 1],  # 1 = upset
        }
        return pd.DataFrame(data)

    def train_model(self):
        """Fit the classifier on the sample data and print its test accuracy.

        Returns:
            The fitted model.
        """
        df = self.prepare_training_data()
        X = df[self.features]
        y = df['is_upset']

        # Hold out 20% of the rows for evaluation.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"模型准确率: {accuracy:.2f}")
        return self.model

    def predict_upset_probability(self, team1_rank, team2_rank,
                                  is_home_team, historical_win_rate):
        """Estimate the upset probability for one match.

        The model is trained on first use if it has not been fitted yet.

        Args:
            team1_rank: World ranking of the first team.
            team2_rank: World ranking of the second team.
            is_home_team: 1 for a home match, 0 otherwise (same encoding as
                the ``home_advantage`` training column).
            historical_win_rate: Value for the ``historical_win_rate`` feature.

        Returns:
            A dict with ``rank_difference``, ``upset_probability`` and a
            human-readable ``prediction`` label.
        """
        if not hasattr(self.model, 'classes_'):
            self.train_model()

        rank_diff = abs(team1_rank - team2_rank)

        # Use the training column names so the input matches what the model
        # was fitted on.
        features = pd.DataFrame(
            [[rank_diff, is_home_team, historical_win_rate]],
            columns=self.features,
        )

        proba = self.model.predict_proba(features)[0]
        # predict_proba columns follow model.classes_, so look up the column
        # of the upset class (1) instead of assuming it is the second one.
        classes = list(self.model.classes_)
        upset_prob = float(proba[classes.index(1)]) if 1 in classes else 0.0

        return {
            'rank_difference': rank_diff,
            'upset_probability': upset_prob,
            'prediction': '可能爆冷' if upset_prob > 0.5 else '不太可能爆冷'
        }
# 使用示例:分析西班牙vs佛得角的比赛
predictor = UpsetPredictor()
prediction = predictor.predict_upset_probability(
team1_rank=8, # 西班牙实际排名
team2_rank=67, # 佛得角排名
is_home_team=0, #