、| 、 | 、、、…
1.3核心代码:
1
2
|
# Non-greedy capture of everything between an opening <tr> and the next </tr>.
res_tr = r'<tr>(.*?)</tr>'
# DOTALL lets "." cross line breaks, so a row spread over several lines still matches.
m_tr = re.compile(res_tr, re.DOTALL | re.MULTILINE).findall(language)
|
1.4例子:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
import re

# Sample HTML: one complete table row followed by a dangling "<tr>" that is
# never closed and therefore produces no match.
language = '''<tr><th>床前明月光</th><td>忧思独伤心</td></tr><tr>'''

# Non-greedy patterns capturing the inner content of <tr>, <th> and <td>
# elements. They are loop-invariant, so they are defined once up front.
res_tr = r"<tr>(.*?)</tr>"
res_th = r"<th>(.*?)</th>"
res_td = r"<td>(.*?)</td>"

# re.S lets "." match newlines so a row may span several lines.
m_tr = re.findall(res_tr, language, re.S | re.M)
# findall returns a list; print it directly. The earlier unicode(list, "utf-8")
# call could never work because unicode() only decodes a single byte string.
print(m_tr)

for line in m_tr:
    print(line)
    # Header cells (<th>) found inside this row.
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        print(mm)
    # Data cells (<td>) found inside this row.
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        print(nn)
|
2.获取超链接之间内容
2.1在使用正则表达式时,需要分析网页链接,获取URL或网页内容。核心代码如下:
1
2
3
|
# Text between each <a ...> opening tag and its closing </a>.
res = r'<a .*?>(.*?)</a>'
mm = re.compile(res, re.DOTALL | re.MULTILINE).findall(content)
# Complete anchor elements that carry an href attribute, matched case-insensitively.
urls = re.compile(r"<a.*?href=.*?<\/a>", re.IGNORECASE | re.DOTALL | re.MULTILINE).findall(content)
|
2.1.1例子:
1
2
3
4
5
6
7
|
import re
# Sample HTML fragment used by the snippets that follow: a single table cell
# containing two anchors, each with an href and a title attribute.
content = '''
<td>
<a href="https://www.baidu.com/articles/zj.html" title="浙江省">浙江省主题介绍</a>
<a href="https://www.baidu.com//articles/gz.html" title="贵州省">贵州省主题介绍</a>
</td>
'''
|
2.1.2获取<a href=...></a>标签之间的内容
1
2
3
4
|
# Capture the visible text between each <a ...> opening tag and its </a>.
res = r'<a .*?>(.*?)</a>'
# re.S lets "." match newlines so an anchor may span several lines.
mm = re.findall(res, content, re.S | re.M)
for value in mm:
    # The loop body must be indented; print one link text per line.
    print(value)
|
2.1.3获取所有<a href=...></a>链接的全部内容
1
2
3
|
# Match each complete anchor element that has an href attribute. The pattern
# starts with "<a" so the returned text includes the opening "<"; without it
# every match began at the bare letter "a" and lost the bracket.
urls = re.findall(r"<a.*?href=.*?<\/a>", content, re.I | re.S | re.M)
for i in urls:
    # print() call form works as a function; one full <a ...>...</a> per line.
    print(i)
|
2.1.4获取<a href=...></a>中的URL
1
2
3
4
|
# Lookbehind/lookahead pair: match the text that follows href=" (or href=')
# up to the next matching quote, without including the quotes themselves.
res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
link = re.findall(res_url, content, re.I | re.S | re.M)
for url in link:
    # The loop body must be indented; print one URL per line.
    print(url)
|
微信赞赏
支付宝赞赏
|