Python URL处理与解析
urllib.parse 模块处理 URL 解析、编码和构建。
urlparse 解析 URL
Python
from urllib.parse import urlparse
# 解析 URL 组件
url = 'https://example.com:8080/path?key=value#section'
result = urlparse(url)
print(result.scheme) # https
print(result.netloc) # example.com:8080
print(result.path) # /path
print(result.params) # ''(路径参数,即路径中 ; 之后的部分,此处为空)
print(result.query) # key=value
print(result.fragment) # section
print(result) # ParseResult 对象
urlunparse 构建 URL
Python
from urllib.parse import urlunparse
# 从组件构建 URL
components = ('https', 'example.com', '/path', '', 'key=value', 'section')
url = urlunparse(components)
print(url) # https://example.com/path?key=value#section
urljoin 拼接 URL
Python
from urllib.parse import urljoin
# 相对 URL 拼接
base = 'https://example.com/path/page.html'
# 相对路径
print(urljoin(base, 'other.html'))
# https://example.com/path/other.html
print(urljoin(base, '../up.html'))
# https://example.com/up.html
print(urljoin(base, '/absolute'))
# https://example.com/absolute
print(urljoin(base, 'https://other.com'))
# https://other.com(完整 URL 覆盖)
urlencode 编码参数
Python
from urllib.parse import urlencode
# 字典编码为查询字符串
params = {'key': 'value', 'name': 'Alice', 'age': 25}
query = urlencode(params)
print(query) # key=value&name=Alice&age=25
# 列表编码(多值参数)
params = [('key', 'value1'), ('key', 'value2')]
query = urlencode(params)
print(query) # key=value1&key=value2
# doseq=True:当值为序列时,展开为多个同名参数(并非自定义分隔符)
query = urlencode(params, doseq=True)
parse_qs 解析查询字符串
Python
from urllib.parse import parse_qs, parse_qsl
# 解析查询字符串为字典
query = 'key=value&name=Alice&list=1&list=2'
result = parse_qs(query)
print(result) # {'key': ['value'], 'name': ['Alice'], 'list': ['1', '2']}
# 解析为列表
result = parse_qsl(query)
print(result) # [('key', 'value'), ('name', 'Alice'), ('list', '1'), ('list', '2')]
quote 编码特殊字符
Python
from urllib.parse import quote, quote_plus
# 编码 URL 中的特殊字符
text = 'hello world/special'
encoded = quote(text)
print(encoded) # hello%20world/special(默认 safe='/',斜杠不会被编码)
# quote_plus 编码空格为 +
encoded = quote_plus(text)
print(encoded) # hello+world%2Fspecial
# safe 参数指定不编码的字符
encoded = quote(text, safe='/')
print(encoded) # hello%20world/special
unquote 解码
Python
from urllib.parse import unquote, unquote_plus
# 解码 URL 编码
encoded = 'hello%20world%2Fspecial'
text = unquote(encoded)
print(text) # hello world/special
# unquote_plus 解码 +
encoded = 'hello+world%2Fspecial'
text = unquote_plus(encoded)
print(text) # hello world/special
综合示例
Python
from urllib.parse import urlparse, urlencode, urlunparse
def build_url(base_url, path, params=None):
    """Build a complete URL from a base URL, an extra path and query parameters.

    Args:
        base_url: URL whose scheme, netloc, path, params and fragment are
            reused in the result.
        path: Path segment appended to the path of ``base_url``.
        params: Optional mapping (or sequence of pairs) encoded as the query
            string. When it is empty or ``None`` the query already present in
            ``base_url`` is kept.

    Returns:
        The assembled URL as a string.
    """
    parsed = urlparse(base_url)

    # Join with exactly one slash so a trailing slash on the base plus a
    # leading slash on ``path`` does not yield "//", and a missing slash does
    # not glue the segments together ("/v1users").
    if parsed.path and path:
        new_path = parsed.path.rstrip('/') + '/' + path.lstrip('/')
    else:
        new_path = parsed.path + path

    # Explicit params replace the base URL's query; otherwise keep it.
    query = urlencode(params) if params else parsed.query

    return urlunparse((
        parsed.scheme,
        parsed.netloc,
        new_path,
        parsed.params,
        query,
        parsed.fragment,
    ))
url = build_url('https://api.example.com/v1', '/users', {'id': 123})
print(url) # https://api.example.com/v1/users?id=123
URL 组件提取
Python
from urllib.parse import urlparse
url = 'https://user:pass@example.com:8080/path?query=1#frag'
result = urlparse(url)
# 提取用户名密码
from urllib.parse import parse_qs
print(result.username) # user
print(result.password) # pass
# 提取主机和端口
print(result.hostname) # example.com
print(result.port) # 8080
函数汇总
| 函数 | 用途 |
|---|---|
| urlparse | 解析 URL |
| urlunparse | 构建 URL |
| urljoin | 拼接 URL |
| urlencode | 编码查询参数 |
| parse_qs | 解析查询字符串(字典) |
| parse_qsl | 解析查询字符串(列表) |
| quote | 编码特殊字符 |
| unquote | 解码特殊字符 |
要点总结
- urlparse() 解析 URL 为各组件
- urlunparse() 从组件构建 URL
- urljoin() 处理相对 URL 拼接
- urlencode() 编码字典为查询字符串
- parse_qs() 解析查询字符串
- quote() 编码特殊字符为 %XX
- quote_plus() 编码空格为 +
- unquote() 解码 URL 编码字符串
- urllib.parse 是 URL 处理的标准工具
📝 发现内容有误?点击此处直接编辑