在下面的代码中,我已经能够把各个标题(header)的索引与其对应文本的索引匹配起来。我不明白的是:当某个标题不在 soup 中时,应该如何追加 np.NaN。这是我上一个问题的后续。
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
def getAndParseURL(url):
    """Download ``url`` and return the page parsed as a BeautifulSoup tree.

    The response body is parsed with Python's built-in ``html.parser``.
    The HTTP status code is not checked, and any exception raised by
    ``requests.get`` propagates to the caller.
    """
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return soup
urls_test = ['https://www.example.com/',
             'https://www.example.com/']

# One result list per attribute; the goal is one entry per URL in each list.
engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)
    # Every <li> inside the lot-breakdown list.
    ul = soup.select('ul[class="list-inline lot-breakdown-list"] li', recursive=True)
    lis_e0 = []  # header texts (first child of each <li>'s <h5>)
    lis_e1 = []  # second child node of each <li> (the value next to the header)
    if ul:
        for li in ul:
            lis0 = []
            lis1 = []
            lis0.append(li.h5.contents[0])
            lis1.append(li.contents[1])
            lis_e0.extend(lis0)
            lis_e1.extend(lis1)
        # NOTE: a header that is absent does not raise anything here -- the
        # loop just finishes without a match -- so the ``except`` branch is
        # never reached and nothing is appended for this URL. That is why the
        # lists can end up shorter than ``urls_test``.
        # NOTE: the ``np.NaN`` alias was removed in NumPy 2.0; ``np.nan`` is
        # the spelling that works on every NumPy version.
        try:
            for i in range(min(len(lis_e1), len(lis_e0))):
                if 'Engine' in lis_e0[i]:
                    engine.append(lis_e1[i])
        except:
            engine.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Trans' in x:
                    trans.append(lis_e1[i])
        except:
            trans.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Color' in x:
                    color.append(lis_e1[i])
        except:
            color.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Interior' in x:
                    interior.append(lis_e1[i])
        except:
            interior.append(np.NaN)
    else:
        # No lot-breakdown list on the page at all: record a gap in every list.
        engine.append(np.NaN)
        trans.append(np.NaN)
        color.append(np.NaN)
        interior.append(np.NaN)

# Bare expressions: these only display a value in an interactive session.
engine
trans
color
interior
print(str(len(engine)))
print(str(len(trans)))
print(str(len(color)))
print(str(len(interior)))
Out:
['383 CI']
['Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
1
1
2
2
下面是我期望得到的输出(“Engine”的 for 循环写法与其他几个不同,但效果应该相同)。每个列表的长度必须与 URL 的数量一致,否则在抓取多个 URL 时,列表索引就无法与正确的 URL 对应。感谢您抽出宝贵的时间!
['NaN', '383 CI']
['NaN', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
2
2
2
2
可以使用 try..except 块:对每个 URL 查找对应的标题,找不到时在 except 分支中追加一个占位值,这样每个列表的长度都与 URL 的数量一致。
import requests
from bs4 import BeautifulSoup
def getAndParseURL(url):
    """Fetch ``url`` with ``requests`` and parse the HTML into BeautifulSoup.

    Uses the standard-library ``html.parser`` backend. No status-code check
    is performed; exceptions from ``requests.get`` are not caught here.
    """
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return soup
urls_test = ['https://www.example.com/',
             'https://www.example.com/']

# CSS selector for the first <li> of the lot-breakdown list on a page.
LOT_ITEM_SELECTOR = 'ul[class="list-inline lot-breakdown-list"]>li'


def _lot_value(soup, header):
    """Return the node following the ``<h5>`` whose string equals ``header``.

    Returns the placeholder string "Nan" when the page has no lot-breakdown
    list or no matching header, so callers always get exactly one value.
    """
    first_item = soup.select_one(LOT_ITEM_SELECTOR)
    try:
        # ``select_one``/``find_next`` return None when nothing matches, which
        # surfaces as AttributeError on the next attribute access.
        heading = first_item.find_next('h5', string=header)
        # Two hops: past the header's own text node to whatever follows it.
        return heading.next_element.next_element
    except AttributeError:
        return "Nan"


engine = []
trans = []
color = []
interior = []

# Header text on the page -> list that collects its value for every URL.
columns = {
    'Engine': engine,
    'Trans': trans,
    'Color': color,
    'Interior': interior,
}

for url in urls_test:
    soup = getAndParseURL(url)
    # Exactly one append per list per URL keeps list indices aligned with URLs.
    for header, values in columns.items():
        values.append(_lot_value(soup, header))

print(engine)
print(trans)
print(color)
print(interior)
输出:
['Nan', '383 CI']
['Nan', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
然后将这些列表加载到 DataFrame 中。
# Each list becomes one column; row i holds the values scraped from URL i.
df = pd.DataFrame(
    {
        "Engine": engine,
        "Trans": trans,
        "Color": color,
        "Interior": interior,
    }
)
print(df)
输出:
Color Engine Interior Trans
0 Green Nan Black Nan
1 Curious Yellow 383 CI Black Automatic
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句