我是Web抓取的新手,也是Python的新手。我想抓取URL所指论坛上每个帖子的标题;当有人用下面列出的标题之一创建新帖子时,我希望收到一封带有该帖子链接的邮件。
通过搜索 div structItem-title,我得到了第 1 页上的 23 个帖子。但是,当我想打印每个帖子的文本时,print(type(first_result.text)) 只输出 <class 'str'>,而 print(type(first_result)) 只输出 <class 'bs4.element.Tag'>。
搜索标题
# Jeti_DS_16 = soup.find_all(text="Jeti DS 16")
# Jeti_DS_16_v2 = soup.find_all(text="Jeti DS 16 2")
# Jeti_DC_16 = soup.find_all(text="Jeti DC 16")
# Jeti_DC_16_v2 = soup.find_all(text="Jeti DC 16 2")
代码
from requests import get
from bs4 import BeautifulSoup
import re
import smtplib
import time
import lxml
# Forum listing page that is fetched and scanned for thread titles.
URL = 'https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'

# Browser-like request headers. The long User-Agent value is split across
# adjacent literals purely for readability; the resulting string is unchanged.
headers = {
    "User-Agent": (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.79 Safari/537.36'
    ),
}
def checkForSearchItem():
    """Fetch the forum page and print type information about its title divs.

    Downloads ``URL``, parses the body with the ``lxml`` parser and collects
    every ``<div class="structItem-title">``.  It then prints, in this order:
    the type of the result collection, the number of matches, the type of the
    first match and the type of that match's ``.text``.  Nothing is returned.

    Taking the first match raises ``IndexError`` when nothing matched.
    """
    response = get(URL)

    # Earlier fetch experiments, kept for reference:
    # print(response.text[:500])
    # page = requests.get(URL, headers=headers)
    # page = requests.get(URL, headers=headers).text
    # page = requests.get(URL).text
    # page = requests.get(URL)

    soup = BeautifulSoup(response.content, "lxml")
    # soup = BeautifulSoup(page.content, "html.parser")
    # soup = BeautifulSoup(page.text, "html.parser")

    # One div per thread title on the listing page.
    title_divs = soup.find_all('div', class_='structItem-title')

    # search_for_main = soup.find_all(
    #     'div', class_="structItemContainer-group js-threadList")

    # Planned title search and notification, not active yet:
    # Jeti_DS_16 = soup.find_all(text="Jeti DS 16")
    # Jeti_DS_16_v2 = soup.find_all(text="Jeti DS 16 2")
    # Jeti_DC_16 = soup.find_all(text="Jeti DC 16")
    # Jeti_DC_16_v2 = soup.find_all(text="Jeti DC 16 2")
    # if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2):
    #     send_mail()
    #     print('Die Nummer {0} {1} {2} {3} wurden gezogen'.format(
    #         Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2))

    print(type(title_divs))
    print(len(title_divs))

    first_result = title_divs[0]
    # Other attribute probes that were tried on the first match:
    # print(type(first_result.h3))
    # print(type(first_result.div.a.text))
    # print(type(first_result.a.text))
    # print(type(first_result.p.text))
    # print(type(first_result.name.text))
    # print(type(first_result.title))

    # These two calls print the *types* of the values, not the values themselves.
    print(type(first_result))
    print(type(first_result.text))
    # print(soup.div)
# Draft of the notification mail sender. It is commented out and nothing calls
# it. As written it would log in to Gmail over SMTP-SSL (port 465) and send one
# fixed message to every address in `emails`.
# def send_mail():
#     server_ssl = smtplib.SMTP_SSL('smtp.gmail.com', 465)
#     server_ssl.ehlo()
#     # server.starttls()
#     # server.ehlo()
#     server_ssl.login('[email protected]', 'SecurePassword')
#     subject = 'Es gibt ein neuer Post im RC-Network auf deine gespeicherte Anfragen. Sieh in dir an{Link to Post}'
#     body = 'Sieh es dir an Link: https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'
#     msg = f"Subject: {subject}\n\n{body}"
#     emails = ["[email protected]"]
#     server_ssl.sendmail(
#         '[email protected]',
#         emails,
#         msg
#     )
#     print('e-Mail wurde versendet!')
#     server_ssl.quit()  # NOTE: the draft had `server_ssl.quit` without parentheses, which never runs the call
# Poll the forum forever, pausing 600 seconds (ten minutes) between checks.
while True:
    checkForSearchItem()
    time.sleep(600)
    # time.sleep(86400)  # alternative: once per day
当您想打印文本时,不需要 type()。type() 函数只是用来查看变量属于哪种类型(int、str……)。去掉 type() 之后,代码在我这里运行正常,可以打印出文本。也就是说,打印语句不要这样写:
print(type(first_result.text))
而是这样写:
print(first_result.text)
希望这就是您遇到的问题,也希望能帮到您。如果您需要帖子的 URI,必须先取得帖子 div 里的 a 标签(<a>),再从它的 href 属性中提取 URI,如下所示:
def checkForSearchItem():
    """Print the title and absolute link of every thread on the forum page.

    Fetches ``URL``, collects each ``<div class="structItem-title">`` and, for
    every one that contains a link, prints the title text followed by the
    thread's absolute URL.  Title divs without a link are skipped.  Returns
    nothing.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import so the snippet works without touching the
    # file's import block.
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the caller forever.
    response = get(URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "lxml")
    for post in soup.find_all('div', class_='structItem-title'):
        # First a-tag inside the div that actually carries an href.
        a_tag = post.find('a', href=True)
        if a_tag is None:
            # No link here, so there is nothing to build a URL from.
            continue

        # The href looks like /threads/sensoren-von-graupner.11835933/, so it
        # is resolved against the page URL to obtain the full URI.
        url = urljoin(URL, a_tag['href'])

        # Collapse the whitespace around the title instead of printing it raw.
        print(post.get_text(" ", strip=True))
        print(url)
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句