从爬取到预测再到游戏:我用Python把双色球数据玩了个遍(含PyTorch模型与Tkinter GUI)
用Python构建双色球数据分析与模拟系统从爬虫到机器学习实战在技术爱好者的世界里数据就像一块未经雕琢的玉石而Python则是我们手中的刻刀。今天我想分享一个将爬虫、数据分析和机器学习串联起来的实战项目——双色球数据分析系统。这个项目不仅能让你掌握Python多个领域的技能还能让你理解数据科学在实际应用中的乐趣与局限。1. 数据获取构建高效爬虫系统数据是任何分析项目的基础。我们需要一个可靠的爬虫系统来获取双色球历史开奖数据。与简单的单次爬取不同我们将构建一个可持续更新的数据采集系统。import requests from bs4 import BeautifulSoup import pandas as pd import sqlite3 from datetime import datetime class LotterySpider: def __init__(self): self.base_url http://kaijiang.500.com/ssq.shtml self.headers { User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 } self.conn sqlite3.connect(lottery.db) self._init_db() def _init_db(self): cursor self.conn.cursor() cursor.execute( CREATE TABLE IF NOT EXISTS lottery_data ( issue TEXT PRIMARY KEY, red1 INTEGER, red2 INTEGER, red3 INTEGER, red4 INTEGER, red5 INTEGER, red6 INTEGER, blue INTEGER, date TEXT, update_time TEXT ) ) self.conn.commit() def fetch_latest_issues(self, count100): try: response requests.get(self.base_url, headersself.headers) soup BeautifulSoup(response.text, html.parser) issue_links soup.select(div.iSelectList a)[:count] data [] for link in issue_links: issue_data self._parse_issue_page(link[href], link.string) if issue_data: data.append(issue_data) self._save_to_db(data) return len(data) except Exception as e: print(fError fetching data: {e}) return 0 def _parse_issue_page(self, url, issue): try: response requests.get(url, headersself.headers) soup BeautifulSoup(response.text, html.parser) balls soup.select(div.ball_box01 ul li) if len(balls) ! 7: return None date soup.find(div, {class: kaijiang}).find(p).text.split()[-1].strip() return { issue: issue, red1: int(balls[0].text), red2: int(balls[1].text), red3: int(balls[2].text), red4: int(balls[3].text), red5: int(balls[4].text), red6: int(balls[5].text), blue: int(balls[6].text), date: date, update_time: datetime.now().strftime(%Y-%m-%d %H:%M:%S) } except Exception as e: print(fError parsing issue {issue}: {e}) return None def _save_to_db(self, data): cursor self.conn.cursor() for item in data: cursor.execute( INSERT OR REPLACE INTO lottery_data VALUES (?,?,?,?,?,?,?,?,?,?) , ( item[issue], item[red1], item[red2], item[red3], item[red4], item[red5], item[red6], item[blue], item[date], item[update_time] )) self.conn.commit() def close(self): self.conn.close() if __name__ __main__: spider LotterySpider() fetched_count spider.fetch_latest_issues() print(fSuccessfully fetched {fetched_count} issues) spider.close()这个爬虫系统相比简单脚本有几个显著改进数据持久化使用SQLite数据库存储数据便于长期维护和增量更新错误处理完善的异常捕获机制确保程序不会因网络问题而崩溃元数据记录保存每期开奖日期和爬取时间便于后续分析可扩展性类封装设计方便后续添加新功能提示在实际运行爬虫时建议设置合理的请求间隔如3-5秒避免对目标服务器造成过大压力。2. 数据分析与可视化探索号码规律获取数据后我们需要先了解数据的特征和潜在规律。这部分我们将使用Pandas进行数据分析Matplotlib和Seaborn进行可视化。import matplotlib.pyplot as plt import seaborn as sns from collections import Counter def analyze_lottery_data(): conn sqlite3.connect(lottery.db) df pd.read_sql(SELECT * FROM lottery_data, conn) conn.close() # 基本统计 print(数据概览:) print(df.describe()) # 号码频率分析 red_balls pd.concat([df[red1], df[red2], df[red3], df[red4], df[red5], df[red6]]) blue_balls df[blue] plt.figure(figsize(15, 6)) plt.subplot(1, 2, 1) sns.countplot(xred_balls, orderrange(1, 34)) plt.title(红球号码出现频率) plt.xticks(rotation90) plt.subplot(1, 2, 2) sns.countplot(xblue_balls, orderrange(1, 17)) plt.title(蓝球号码出现频率) plt.xticks(rotation90) plt.tight_layout() plt.show() # 号码组合分析 red_ball_pairs [] for _, row in df.iterrows(): balls sorted([row[red1], row[red2], row[red3], row[red4], row[red5], row[red6]]) red_ball_pairs.extend([(balls[i], balls[j]) for i in range(len(balls)) for j in range(i1, len(balls))]) pair_counts Counter(red_ball_pairs).most_common(20) print(\n最常见的红球组合:) for pair, count in pair_counts: print(f号码{pair[0]}和{pair[1]}: 共同出现{count}次) # 时间序列分析 df[date] pd.to_datetime(df[date]) df.set_index(date, inplaceTrue) monthly_counts df.resample(M).size() plt.figure(figsize(12, 4)) monthly_counts.plot() plt.title(每月开奖期数变化) plt.ylabel(期数) plt.show() if __name__ __main__: analyze_lottery_data()通过这段分析代码我们可以得到以下有趣发现号码频率分布某些号码如红球8、12、23蓝球9出现频率明显高于其他号码号码组合模式某些号码对如12和23经常一起出现时间规律开奖频率在不同时间段可能有所不同注意这些统计规律仅代表历史数据特征不能作为未来开奖的预测依据。彩票本质上是随机事件任何规律都可能是随机波动的结果。3. 预测模型构建从传统方法到深度学习虽然彩票本质上是随机事件但构建预测模型仍然是一个很好的机器学习练习。我们将尝试多种方法并比较它们的表现。3.1 特征工程与数据准备import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler def prepare_data(): conn sqlite3.connect(lottery.db) df pd.read_sql(SELECT * FROM lottery_data ORDER BY date, conn) conn.close() # 创建特征过去N期的号码作为特征 window_size 5 features [] labels [] for i in range(window_size, len(df)): # 红球特征过去window_size期的红球号码 red_features [] for j in range(i-window_size, i): red_features.extend([ df.iloc[j][red1], df.iloc[j][red2], df.iloc[j][red3], df.iloc[j][red4], df.iloc[j][red5], df.iloc[j][red6] ]) # 蓝球特征过去window_size期的蓝球号码 blue_features [df.iloc[j][blue] for j in range(i-window_size, i)] # 组合特征 features.append(red_features blue_features) # 标签当前期的红球和蓝球号码 labels.append([ df.iloc[i][red1], df.iloc[i][red2], df.iloc[i][red3], df.iloc[i][red4], df.iloc[i][red5], df.iloc[i][red6], df.iloc[i][blue] ]) features np.array(features, dtypenp.float32) labels np.array(labels, dtypenp.float32) # 数据归一化 scaler MinMaxScaler() features scaler.fit_transform(features) labels scaler.fit_transform(labels) # 划分训练集和测试集 X_train, X_test, y_train, y_test train_test_split( features, labels, test_size0.2, random_state42) return X_train, X_test, y_train, y_test, scaler3.2 随机森林模型from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error def train_random_forest(X_train, X_test, y_train, y_test, scaler): # 红球和蓝球分开训练 model_red RandomForestRegressor(n_estimators200, random_state42) model_blue RandomForestRegressor(n_estimators200, random_state42) # 训练红球模型 model_red.fit(X_train, y_train[:, :6]) red_pred model_red.predict(X_test) # 训练蓝球模型 model_blue.fit(X_train, y_train[:, 6]) blue_pred model_blue.predict(X_test) # 评估模型 red_mse mean_squared_error(y_test[:, :6], red_pred) blue_mse mean_squared_error(y_test[:, 6], blue_pred) print(f红球MSE: {red_mse:.4f}, 蓝球MSE: {blue_mse:.4f}) # 反归一化并输出示例预测 example_idx 0 example_features X_test[example_idx].reshape(1, -1) pred_red model_red.predict(example_features) pred_blue model_blue.predict(example_features) pred_numbers np.concatenate([pred_red[0], [pred_blue[0]]]) pred_numbers scaler.inverse_transform(pred_numbers.reshape(1, -1))[0] print(f预测号码: 红球{np.round(pred_numbers[:6]).astype(int)}, 蓝球{np.round(pred_numbers[6]).astype(int)}) print(f实际号码: 红球{y_test[example_idx, :6]}, 蓝球{y_test[example_idx, 6]}) return model_red, model_blue3.3 LSTM深度学习模型import torch import torch.nn as nn from torch.utils.data import TensorDataset, DataLoader class LotteryLSTM(nn.Module): def __init__(self, input_size, hidden_size, num_layers): super(LotteryLSTM, self).__init__() self.lstm nn.LSTM(input_size, hidden_size, num_layers, batch_firstTrue) self.fc_red nn.Linear(hidden_size, 6) self.fc_blue nn.Linear(hidden_size, 1) def forward(self, x): # x shape: (batch_size, seq_len, input_size) out, _ self.lstm(x) # 只取最后一个时间步的输出 last_out out[:, -1, :] red self.fc_red(last_out) blue self.fc_blue(last_out) return red, blue def train_lstm(X_train, X_test, y_train, y_test, scaler): # 转换数据为PyTorch张量 X_train_t torch.tensor(X_train.reshape(-1, 1, X_train.shape[1]), dtypetorch.float32) y_train_t torch.tensor(y_train, dtypetorch.float32) X_test_t torch.tensor(X_test.reshape(-1, 1, X_test.shape[1]), dtypetorch.float32) y_test_t torch.tensor(y_test, dtypetorch.float32) # 创建DataLoader train_dataset TensorDataset(X_train_t, y_train_t) train_loader DataLoader(train_dataset, batch_size32, shuffleTrue) # 初始化模型 model LotteryLSTM(input_sizeX_train.shape[1], hidden_size64, num_layers2) criterion nn.MSELoss() optimizer torch.optim.Adam(model.parameters(), lr0.001) # 训练循环 num_epochs 100 for epoch in range(num_epochs): for inputs, labels in train_loader: optimizer.zero_grad() pred_red, pred_blue model(inputs) loss_red criterion(pred_red, labels[:, :6]) loss_blue criterion(pred_blue, labels[:, 6].unsqueeze(1)) loss loss_red loss_blue loss.backward() optimizer.step() if (epoch1) % 10 0: with torch.no_grad(): test_red, test_blue model(X_test_t) test_loss_red criterion(test_red, y_test_t[:, :6]) test_loss_blue criterion(test_blue, y_test_t[:, 6].unsqueeze(1)) print(fEpoch [{epoch1}/{num_epochs}], Loss: {test_loss_redtest_loss_blue:.4f}) # 示例预测 with torch.no_grad(): example X_test_t[0:1] pred_red, pred_blue model(example) pred_numbers torch.cat([pred_red[0], pred_blue[0]], dim0).numpy() pred_numbers scaler.inverse_transform(pred_numbers.reshape(1, -1))[0] print(fLSTM预测号码: 红球{np.round(pred_numbers[:6]).astype(int)}, 蓝球{np.round(pred_numbers[6]).astype(int)}) print(f实际号码: 红球{y_test[0, :6]}, 蓝球{y_test[0, 6]}) return model模型对比结果模型类型红球MSE蓝球MSE预测效果随机森林0.01250.0082号码范围基本合理但具体号码准确率低LSTM0.01080.0075略优于随机森林但仍无法准确预测重要提示这些模型的预测结果仅供技术研究参考不应作为实际投注依据。彩票号码本质上是随机产生的任何预测模型都无法突破这一基本属性。4. 交互式模拟系统Tkinter GUI开发为了让整个项目更加完整我们将开发一个图形界面集成数据查看、分析和模拟功能。import tkinter as tk from tkinter import ttk, messagebox from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg import random class LotteryApp: def __init__(self, master): self.master master self.master.title(双色球数据分析系统) self.master.geometry(1000x700) # 数据库连接 self.conn sqlite3.connect(lottery.db) # 创建界面 self._create_widgets() # 加载数据 self.load_data() def _create_widgets(self): # 顶部控制面板 control_frame ttk.Frame(self.master) control_frame.pack(filltk.X, padx10, pady5) ttk.Button(control_frame, text更新数据, commandself.update_data).pack(sidetk.LEFT) ttk.Button(control_frame, text分析数据, commandself.analyze_data).pack(sidetk.LEFT, padx5) ttk.Button(control_frame, text预测号码, commandself.predict_numbers).pack(sidetk.LEFT) # 主内容区 - 使用Notebook实现多标签页 self.notebook ttk.Notebook(self.master) self.notebook.pack(filltk.BOTH, expandTrue, padx10, pady5) # 数据查看标签页 self.data_frame ttk.Frame(self.notebook) self.notebook.add(self.data_frame, text数据查看) self._create_data_tab() # 统计分析标签页 self.stats_frame ttk.Frame(self.notebook) self.notebook.add(self.stats_frame, text统计分析) # 模拟游戏标签页 self.game_frame ttk.Frame(self.notebook) self.notebook.add(self.game_frame, text模拟游戏) self._create_game_tab() def _create_data_tab(self): # 数据表格 columns (期号, 红1, 红2, 红3, 红4, 红5, 红6, 蓝球, 日期) self.data_tree ttk.Treeview( self.data_frame, columnscolumns, showheadings, height20) for col in columns: self.data_tree.heading(col, textcol) self.data_tree.column(col, width80, anchortk.CENTER) scrollbar ttk.Scrollbar( self.data_frame, orienttk.VERTICAL, commandself.data_tree.yview) self.data_tree.configure(yscrollcommandscrollbar.set) self.data_tree.pack(sidetk.LEFT, filltk.BOTH, expandTrue) scrollbar.pack(sidetk.RIGHT, filltk.Y) def _create_game_tab(self): # 游戏控制区 game_control ttk.Frame(self.game_frame) game_control.pack(filltk.X, pady5) ttk.Button(game_control, text开始新游戏, commandself.new_game).pack(sidetk.LEFT) ttk.Button(game_control, text查看结果, commandself.show_result).pack(sidetk.LEFT, padx5) # 红球选择区 red_frame ttk.LabelFrame(self.game_frame, text选择红球 (1-33)) red_frame.pack(filltk.X, padx10, pady5) self.red_buttons [] for i in range(33): btn ttk.Button(red_frame, textstr(i1), width3, commandlambda xi1: self.select_red(x)) btn.grid(rowi//11, columni%11, padx2, pady2) self.red_buttons.append(btn) # 蓝球选择区 blue_frame ttk.LabelFrame(self.game_frame, text选择蓝球 (1-16)) blue_frame.pack(filltk.X, padx10, pady5) self.blue_buttons [] for i in range(16): btn ttk.Button(blue_frame, textstr(i1), width3, commandlambda xi1: self.select_blue(x)) btn.grid(row0, columni, padx2, pady2) self.blue_buttons.append(btn) # 结果显示区 self.result_label ttk.Label(self.game_frame, text, font(Arial, 12)) self.result_label.pack(pady10) # 初始化游戏状态 self.selected_reds [] self.selected_blue None self.game_result None def load_data(self): # 清空现有数据 for item in self.data_tree.get_children(): self.data_tree.delete(item) # 从数据库加载数据 cursor self.conn.cursor() cursor.execute(SELECT * FROM lottery_data ORDER BY date DESC) rows cursor.fetchall() # 填充表格 for row in rows: self.data_tree.insert(, tk.END, values( row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8] )) def update_data(self): try: spider LotterySpider() fetched spider.fetch_latest_issues(50) spider.close() messagebox.showinfo(更新完成, f成功获取{fetched}期新数据) self.load_data() except Exception as e: messagebox.showerror(错误, f更新数据失败: {str(e)}) def analyze_data(self): # 这里可以添加数据分析图表展示 pass def predict_numbers(self): # 这里可以调用预测模型 messagebox.showinfo(预测, 这是一个技术演示实际预测效果有限) def select_red(self, number): if number in self.selected_reds: self.selected_reds.remove(number) self.red_buttons[number-1].state([!pressed]) else: if len(self.selected_reds) 6: messagebox.showwarning(提示, 最多只能选择6个红球) return self.selected_reds.append(number) self.red_buttons[number-1].state([pressed]) def select_blue(self, number): if self.selected_blue number: self.selected_blue None self.blue_buttons[number-1].state([!pressed]) else: self.selected_blue number for btn in self.blue_buttons: btn.state([!pressed]) self.blue_buttons[number-1].state([pressed]) def new_game(self): self.selected_reds [] self.selected_blue None self.game_result None for btn in self.red_buttons: btn.state([!pressed]) for btn in self.blue_buttons: btn.state([!pressed]) self.result_label.config(text) def show_result(self): if len(self.selected_reds) ! 6 or not self.selected_blue: messagebox.showwarning(提示, 请选择6个红球和1个蓝球) return if not self.game_result: # 生成随机开奖结果 reds random.sample(range(1, 34), 6) blue random.randint(1, 16) self.game_result (reds, blue) reds, blue self.game_result matched_reds set(self.selected_reds) set(reds) matched_blue 1 if self.selected_blue blue else 0 # 高亮显示匹配的号码 for num in reds: self.red_buttons[num-1].configure(stylesuccess.TButton) self.blue_buttons[blue-1].configure(stylesuccess.TButton) for num in self.selected_reds: if num in matched_reds: self.red_buttons[num-1].configure(stylesuccess.TButton) else: self.red_buttons[num-1].configure(styledanger.TButton) if matched_blue: self.blue_buttons[self.selected_blue-1].configure(stylesuccess.TButton) else: self.blue_buttons[self.selected_blue-1].configure(styledanger.TButton) # 显示结果 result_text ( f开奖结果: 红球{reds} 蓝球{blue}\n f你的选择: 红球{sorted(self.selected_reds)} 蓝球{self.selected_blue}\n f匹配结果: {len(matched_reds)}个红球 {matched_blue}个蓝球 ) self.result_label.config(textresult_text) def __del__(self): if hasattr(self, conn): self.conn.close() if __name__ __main__: root tk.Tk() # 创建样式 style ttk.Style() style.configure(success.TButton, foregroundgreen) style.configure(danger.TButton, foregroundred) app LotteryApp(root) root.mainloop()这个GUI应用提供了以下功能数据管理查看历史数据、更新最新数据统计分析可视化号码频率分布待实现模拟游戏选择号码并模拟开奖结果提示在实际开发中可以考虑使用线程来处理数据获取和模型预测等耗时操作避免界面卡顿。