全部学科
Python全栈
python
NodeJS全栈
nodejs
小程序首页
📅 2026-05-19 8 分钟 ✍️ juanwangdev

Python机器学习生态

Python是机器学习领域的主流语言,拥有丰富的框架和工具生态系统。

scikit-learn经典机器学习

核心特性

Python
from sklearn import *

# scikit-learn模块结构:
# sklearn.datasets    - 数据集加载
# sklearn.preprocessing - 数据预处理
# sklearn.feature_extraction - 特征提取
# sklearn.model_selection - 模型选择
# sklearn.linear_model - 线性模型
# sklearn.tree       - 决策树
# sklearn.ensemble   - 集成方法
# sklearn.cluster    - 聚类
# sklearn.metrics    - 评估指标
# sklearn.pipeline   - 流程管道

数据预处理

Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MinMax归一化
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X)

分类模型

Python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# 逻辑回归
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 决策树
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)

# 随机森林
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

# SVM
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)

# 预测与评估
accuracy = clf.score(X_test, y_test)

回归模型

Python
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor

# 线性回归
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.coef_          # 系数
lr.intercept_     # 截距

# 岭回归(L2正则化)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# 梯度提升回归
gbr = GradientBoostingRegressor(n_estimators=100)
gbr.fit(X_train, y_train)

模型评估

Python
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score

# 分类评估
accuracy_score(y_test, y_pred)       # 准确率
precision_score(y_test, y_pred)      # 精确率
recall_score(y_test, y_pred)         # 召回率
f1_score(y_test, y_pred)             # F1分数
confusion_matrix(y_test, y_pred)     # 混淆矩阵
classification_report(y_test, y_pred) # 分类报告

# 回归评估
mean_squared_error(y_test, y_pred)   # MSE
r2_score(y_test, y_pred)             # R2系数

Pipeline管道

Python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 组合预处理和模型
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

pipe.fit(X_train, y_train)
pipe.predict(X_test)

# 交叉验证
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipe, X, y, cv=5)

TensorFlow深度学习

TensorFlow核心

Python
import tensorflow as tf

# TensorFlow 2.x 使用Keras API

# 创建模型
# Declare the input with an explicit Input object: Keras 3 emits a warning
# when `input_shape` is passed to a layer inside a Sequential model.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(10,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# 编译模型
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# 训练
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# 评估
model.evaluate(X_test, y_test)

# 预测
predictions = model.predict(X_test)

自定义模型

Python
import tensorflow as tf

class CustomModel(tf.keras.Model):
    """Subclassed Keras model made of three Dense layers.

    Two ReLU-activated Dense layers (64 and 32 units) are followed by a
    1-unit Dense layer created without an activation argument.
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense
        # Layers are created in the same order in which call() applies them.
        self.dense1 = dense(64, activation='relu')
        self.dense2 = dense(32, activation='relu')
        self.output_layer = dense(1)

    def call(self, inputs):
        """Pass `inputs` through dense1, dense2 and output_layer, in that order."""
        hidden = inputs
        for layer in (self.dense1, self.dense2):
            hidden = layer(hidden)
        return self.output_layer(hidden)

model = CustomModel()
model.compile(optimizer='adam', loss='mse')

卷积神经网络(CNN)

Python
import tensorflow as tf

# CNN模型
# Declare the input with an explicit Input object: Keras 3 emits a warning
# when `input_shape` is passed to a layer inside a Sequential model.
cnn = tf.keras.Sequential([
    tf.keras.Input(shape=(28, 28, 1)),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])

cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

PyTorch深度学习

PyTorch核心

Python
import torch
import torch.nn as nn
import torch.optim as optim

# 张量操作
x = torch.tensor([1, 2, 3])
y = torch.randn(3, 4)

# GPU支持
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = x.to(device)

# 定义模型
class Net(nn.Module):
    """Fully connected network: Linear(10, 64) -> Linear(64, 32) -> Linear(32, 1).

    ReLU is applied after the first two layers; the last layer's output is
    returned as-is.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept, so parameter names
        # (fc1.*, fc2.*, fc3.*) are unchanged.
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Apply fc1 and fc2 with ReLU, then fc3, and return the result."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

model = Net().to(device)

# 训练循环
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

for epoch in range(10):
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

PyTorch DataLoader

Python
import torch
from torch.utils.data import DataLoader, TensorDataset

# 创建数据集
dataset = TensorDataset(X_tensor, y_tensor)

# DataLoader
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# 遍历数据
for batch_x, batch_y in loader:
    pass

PyTorch CNN

Python
import torch.nn as nn

class CNN(nn.Module):
    """Small convolutional classifier with a 10-way output.

    Two 3x3 convolution + 2x2 max-pool stages (32, then 64 channels) feed a
    64-unit hidden layer. fc1 expects 64 * 5 * 5 features, which is what a
    1-channel 28x28 input produces: 28 -> 26 (conv) -> 13 (pool) -> 11 (conv)
    -> 5 (pool).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = nn.Linear(64 * 5 * 5, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the fc2 outputs (10 values per sample) for image batch `x`."""
        # Only `torch.nn as nn` is imported alongside this class, so the
        # activation is taken from nn.functional rather than the top-level
        # `torch` name, which would raise NameError here.
        relu = nn.functional.relu
        x = self.pool(relu(self.conv1(x)))
        x = self.pool(relu(self.conv2(x)))
        # Collapse channels and spatial dims into one feature axis for fc1.
        x = x.view(-1, 64 * 5 * 5)
        x = relu(self.fc1(x))
        return self.fc2(x)

框架对比

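| 特性 | scikit-learn | TensorFlow | PyTorch |
| --- | --- | --- | --- |
| 类型 | 经典ML | 深度学习 | 深度学习 |
| API风格 | 统一API | Keras高层API | 动态图 |
| 适用场景 | 传统ML算法 | 生产部署 | 研究、灵活开发 |
| GPU支持 | 有限 | 原生支持 | 原生支持 |
| 学习曲线 | 简单 | 中等 | 较难 |
| 生产部署 | 需配合 | TF Serving完整 | TorchServe |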

选择指南

Python
def choose_framework(task_type=None, requires=None, domain=None):
    """Suggest an ML framework for a project.

    Rules are checked in order and the first match wins.

    Args:
        task_type: Kind of task. 'classification', 'regression' or
            'clustering' selects scikit-learn.
        requires: Main requirement for deep-learning work. 'flexibility'
            selects PyTorch; 'production' selects TensorFlow.
        domain: Application domain. 'vision' or 'nlp' means either
            deep-learning framework is suitable.

    Returns:
        The suggested framework name, or None when no rule matches.
    """
    # Classic machine learning (classification, regression, clustering).
    if task_type in ('classification', 'regression', 'clustering'):
        return 'scikit-learn'

    # Deep learning where flexible research and experimentation matter most.
    if requires == 'flexibility':
        return 'PyTorch'

    # Deep learning aimed at production deployment and large-scale use.
    if requires == 'production':
        return 'TensorFlow'

    # Complex deep-learning domains such as computer vision and NLP.
    if domain in ('vision', 'nlp'):
        return 'PyTorch or TensorFlow'

    # No rule matched; make the "no suggestion" result explicit.
    return None

安装与版本

Bash
# Install the frameworks
pip install scikit-learn
pip install tensorflow
pip install torch torchvision

# GPU builds
# The separate tensorflow-gpu package has been retired; GPU support ships in
# the main tensorflow package, and the and-cuda extra (Linux) pulls in the
# matching CUDA libraries.
pip install 'tensorflow[and-cuda]'
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118

# Version check (run through the Python interpreter so every line here is a valid shell command)
python -c "import sklearn; print(sklearn.__version__)"       # 1.4.x
python -c "import tensorflow as tf; print(tf.__version__)"   # 2.x
python -c "import torch; print(torch.__version__)"           # 2.x

注意:深度学习框架版本更新频繁,建议使用稳定版本,注意CUDA版本匹配。

要点总结

  • scikit-learn:经典ML统一API、预处理+模型+评估完整流程、Pipeline管道化
  • TensorFlow:Keras高层API、生产部署完善、适合企业应用
  • PyTorch:动态图灵活、研究友好、GPU原生支持、社区活跃
  • 框架选择:经典ML用sklearn,深度学习研究用PyTorch,生产用TensorFlow

存放路径articles/PYTHON/专家/生态与工具链/机器学习生态.md

📝 发现内容有误?点击此处直接编辑

← 上一篇 Python数据科学工具概览
下一篇 → Python虚拟环境管理
想查看更多题目和详细解析?
小程序提供完整的题库、模拟考试和详细解析
马上就来

长按或扫描二维码,立即体验

扫码体验小程序
马上就来
使用微信扫描二维码
立即体验完整题库