先获取网页的内容(HTML 源码),
然后用正则表达式在源码中进行匹配,并把匹配到的结果保存到列表里。
import requests
import re
# Fetch the HTML source of a web page by URL.
def getHtmlContent(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, so a finite default is used.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
# Match the "cover_thumbkey" values in the HTML source with a regular
# expression and return the list of matches.
#
# Notes:
# - Only the text captured by the parenthesised group ends up in the result.
# - The r'...' prefix makes a raw string: backslashes are kept literally
#   instead of starting escape sequences.
# - [^"]+ keeps the capture inside the quoted value; a plain .+? could run
#   past the closing quote when the value is empty or is not followed by a
#   comma, and would then swallow the following fields.
_COVER_THUMBKEY_RE = re.compile(r'"cover_thumbkey":"([^"]+)"')


def getKeys(htmlContent):
    """Return every non-empty ``cover_thumbkey`` value found in ``htmlContent``.

    Args:
        htmlContent: Page source to search, as a string.

    Returns:
        A list of the captured key strings in order of appearance; an empty
        list when nothing matches.
    """
    return _COVER_THUMBKEY_RE.findall(htmlContent)
# Script entry: download the page at `url`, extract the keys, and print them.
url = 'http://image.so.com/z?ch=go'
content = getHtmlContent(url)
keys = getKeys(content)
for key in keys:
    # The explicit "\n" plus print's own newline leaves a blank line
    # after each key.
    print(key+"\n")
结果:
t011fdff1cc0c3b5ddc.jpg
t01279fce5fe1f2724e.jpg
t01d51d0555bd71a76d.jpg
t011ed903a04d9cf633.jpg
t0130237d0b387f9c1e.jpg
t0165425a5c9db4e230.jpg
t01f0bed56899a8e3dc.jpg
t01561557b9f4a1c585.jpg
t01998b6d28e50fb6b5.jpg
t011c4860a95a36bd17.jpg
t01aa63d968ee65a5c3.jpg
t013b1d241effa05ab6.jpg
t01cac3fe72a340d3a9.jpg
t019ee6b26618740ea0.jpg
t01d887dd159577a87e.jpg
t0129c3dec9b29b98c7.jpg
t01d5e6034ec7672198.jpg
t01305c4605a919ef58.jpg
t01d8f3a130704bb822.jpg
t01e66a112c4b074bff.jpg
t01c3af0dd9ce5fed4f.jpg
t01c1778a8a1c098def.jpg
t01d25127cf4c4dfa86.jpg
t01dcd5dbf75ba6d9c8.jpg
t0101bc5934a0f24496.jpg
t0198ead75c49df97f4.jpg
t014cae84604d1faa82.jpg
t015ef9d0f2429a24ba.jpg
t017339b1343574c7a1.jpg
t01388041a45aee56e1.jpg