Python机器学习生态
Python是机器学习领域的主流语言,拥有丰富的框架和工具生态系统。
scikit-learn经典机器学习
核心特性
Python
# Import the package explicitly. A wildcard import hides where names come
# from and collides with ordinary variable names such as `tree` or `svm`.
import sklearn

# scikit-learn module layout:
# sklearn.datasets           - dataset loading
# sklearn.preprocessing      - data preprocessing
# sklearn.feature_extraction - feature extraction
# sklearn.model_selection    - model selection
# sklearn.linear_model       - linear models
# sklearn.tree               - decision trees
# sklearn.ensemble           - ensemble methods
# sklearn.cluster            - clustering
# sklearn.metrics            - evaluation metrics
# sklearn.pipeline           - pipelines
数据预处理
Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
# Split the data: 80% for training, 20% held out for testing.
# NOTE(review): X and y are not defined in this snippet; they are assumed to
# be the feature matrix and target vector.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardization: fit on the training split only, then apply the fitted
# scaler to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Min-max normalization, also fitted on the training split only so that
# no information from the test split leaks into preprocessing.
minmax = MinMaxScaler()
X_train_normalized = minmax.fit_transform(X_train)
X_test_normalized = minmax.transform(X_test)
分类模型
Python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Logistic regression: fit on the training split, predict the test split.
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Decision tree capped at depth 5.
tree = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)

# Random forest built from 100 trees.
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Support vector machine with an RBF kernel.
svm = SVC(kernel='rbf').fit(X_train, y_train)

# Score the logistic-regression model on the test split.
accuracy = clf.score(X_test, y_test)
回归模型
Python
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
# Ordinary least-squares linear regression.
lr = LinearRegression().fit(X_train, y_train)
lr.coef_       # fitted coefficients
lr.intercept_  # fitted intercept

# Ridge regression (L2 regularization).
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Gradient-boosting regression with 100 estimators.
gbr = GradientBoostingRegressor(n_estimators=100).fit(X_train, y_train)
模型评估
Python
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
# Classification metrics (each call compares true labels with predictions)
accuracy_score(y_test, y_pred) # accuracy
precision_score(y_test, y_pred) # precision
recall_score(y_test, y_pred) # recall
f1_score(y_test, y_pred) # F1 score
confusion_matrix(y_test, y_pred) # confusion matrix
classification_report(y_test, y_pred) # per-class text report
# Regression metrics
# NOTE(review): y_pred is reused from the classification calls above; for
# these metrics it presumably should hold a regressor's predictions — confirm.
mean_squared_error(y_test, y_pred) # MSE
r2_score(y_test, y_pred) # R^2 coefficient of determination
Pipeline管道
Python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Chain preprocessing and the classifier into a single estimator.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression()),
])
pipe.fit(X_train, y_train)
pipe.predict(X_test)

# 5-fold cross-validation of the whole pipeline.
scores = cross_val_score(pipe, X, y, cv=5)
TensorFlow深度学习
TensorFlow核心
Python
import tensorflow as tf
# TensorFlow 2.x exposes the Keras API as tf.keras.
# Build the model. The input shape is declared with an explicit Input object
# rather than the layer-level `input_shape` argument, which newer Keras
# versions warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(10,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model for binary classification.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train, holding out 20% of the training data for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on the test split.
model.evaluate(X_test, y_test)

# Predict.
predictions = model.predict(X_test)
自定义模型
Python
import tensorflow as tf
class CustomModel(tf.keras.Model):
    """Subclassed Keras model: two ReLU dense layers and a one-unit output layer."""

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        # No activation is passed, so the output layer is a plain Dense(1).
        self.output_layer = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Run the forward pass: dense1 -> dense2 -> output_layer."""
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)


model = CustomModel()
model.compile(optimizer='adam', loss='mse')
卷积神经网络(CNN)
Python
import tensorflow as tf
# CNN model for 28x28 single-channel inputs and 10 output classes.
# The input shape is declared with an explicit Input object rather than the
# layer-level `input_shape` argument, which newer Keras versions warn about.
cnn = tf.keras.Sequential([
    tf.keras.Input(shape=(28, 28, 1)),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])
cnn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
PyTorch深度学习
PyTorch核心
Python
import torch
import torch.nn as nn
import torch.optim as optim
# Tensor basics: one tensor built from a Python list, one filled with
# random normal values.
x = torch.tensor([1, 2, 3])
y = torch.randn(3, 4)

# GPU support: use CUDA when it is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
x = x.to(device)
# 定义模型
class Net(nn.Module):
    """Fully connected network: 10 input features -> 64 -> 32 -> 1 output."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Apply fc1 and fc2 with ReLU activations, then the linear layer fc3."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
model = Net().to(device)

# Training loop
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
# NOTE(review): `dataloader` is not defined in this snippet; it is presumably
# a torch.utils.data.DataLoader yielding (inputs, targets) batches — confirm.
for epoch in range(10):
    for batch_x, batch_y in dataloader:
        # Move the batch to the same device as the model.
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        # Reset gradients before the backward pass of this batch.
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
PyTorch DataLoader
Python
import torch
from torch.utils.data import DataLoader, TensorDataset
# Build a dataset from a feature tensor and a target tensor.
# NOTE(review): X_tensor and y_tensor are not defined in this snippet.
dataset = TensorDataset(X_tensor, y_tensor)

# DataLoader: batches of 32 samples with shuffling enabled. It is named
# `dataloader` to match the name the training loop iterates over.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over the batches.
for batch_x, batch_y in dataloader:
    pass
PyTorch CNN
Python
import torch.nn as nn
class CNN(nn.Module):
    """Small convolutional classifier with 10 output logits.

    The first linear layer expects 64 * 5 * 5 features per sample, which is
    what the two conv/pool stages produce for single-channel 28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = nn.Linear(64 * 5 * 5, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        """Return class logits for a batch of images.

        Raises:
            RuntimeError: if the spatial size of ``x`` does not yield
                64 * 5 * 5 features per sample.
        """
        # Only `nn` is imported by this snippet, so activations go through
        # nn.functional instead of the top-level torch namespace.
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot silently fold a wrong feature count into the batch size.
        x = x.flatten(start_dim=1)
        x = nn.functional.relu(self.fc1(x))
        return self.fc2(x)
框架对比
| 特性 | scikit-learn | TensorFlow | PyTorch |
|---|---|---|---|
| 类型 | 经典ML | 深度学习 | 深度学习 |
| API风格 | 统一API | Keras高层API | 动态图 |
| 适用场景 | 传统ML算法 | 生产部署 | 研究、灵活开发 |
| GPU支持 | 有限 | 原生支持 | 原生支持 |
| 学习曲线 | 简单 | 中等 | 较难 |
| 生产部署 | 需配合其他工具(如joblib、ONNX) | TF Serving完整支持 | TorchServe |
选择指南
Python
def choose_framework(task_type=None, requires=None, domain=None):
    """Suggest a machine-learning framework for a project.

    Rules are checked in order and the first match wins.

    Args:
        task_type: Kind of task, e.g. 'classification', 'regression' or
            'clustering'.
        requires: Main requirement, 'flexibility' or 'production'.
        domain: Application domain, e.g. 'vision' or 'nlp'.

    Returns:
        The suggested framework name, or None when no rule matches.
    """
    # Classic machine learning (classification, regression, clustering)
    if task_type in ('classification', 'regression', 'clustering'):
        return 'scikit-learn'
    # Deep learning that needs flexible research and experimentation
    if requires == 'flexibility':
        return 'PyTorch'
    # Deep learning that needs production deployment at scale
    if requires == 'production':
        return 'TensorFlow'
    # Complex deep learning such as computer vision or NLP
    if domain in ('vision', 'nlp'):
        return 'PyTorch or TensorFlow'
    return None
安装与版本
Bash
# Install the frameworks
pip install scikit-learn
pip install tensorflow
pip install torch torchvision
# GPU builds
# The separate tensorflow-gpu package is obsolete; GPU support ships with the
# main tensorflow package, and the and-cuda extra pulls in the CUDA libraries
# (Linux, recent TensorFlow 2.x releases).
pip install 'tensorflow[and-cuda]'
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
Python
# 版本检查
import sklearn
import tensorflow as tf
import torch

# Print the installed version of each framework, one per line:
# scikit-learn (e.g. 1.4.x), TensorFlow (2.x), PyTorch (2.x).
print(sklearn.__version__, tf.__version__, torch.__version__, sep='\n')
注意:深度学习框架版本更新频繁,建议使用稳定版本,注意CUDA版本匹配。
要点总结
- scikit-learn:经典ML统一API、预处理+模型+评估完整流程、Pipeline管道化
- TensorFlow:Keras高层API、生产部署完善、适合企业应用
- PyTorch:动态图灵活、研究友好、GPU原生支持、社区活跃
- 框架选择:经典ML用sklearn,深度学习研究用PyTorch,生产用TensorFlow
存放路径:articles/PYTHON/专家/生态与工具链/机器学习生态.md
📝 发现内容有误?点击此处直接编辑