I have this code:
import pyperclip
import requests
from bs4 import BeautifulSoup

base_url = "https://www.bbc.com"
url = base_url + "/news/world"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.find_all('div', class_='gs-c-promo-body')
text = ''
for article in articles:
    headline = article.find('h3', class_='gs-c-promo-heading__title')
    if headline:
        text += headline.text + '\n'
    summary = article.find('p', class_='gs-c-promo-summary')
    if summary:
        text += summary.text + '\n'
    link = article.find('a', class_='gs-c-promo-heading')
    if link:
        href = link['href']
        if href.startswith('//'):
            article_url = 'https:' + href
        else:
            article_url = base_url + href
        article_response = requests.get(article_url)
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        article_text = article_soup.find('div', class_='story-body__inner')
        if article_text:
            text += article_text.get_text() + '\n\n'

pyperclip.copy(text)
The code above is just an example: I need to copy the text of each headline together with the content of the article behind it. In other words, I want a Python script that automatically goes to a website, reproduces each headline, inserts an empty line, and then adds the content that belongs to that headline. When I run the code, I get the following error:
Traceback (most recent call last):
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connection.py", line 200, in _new_conn
    sock = connection.create_connection(
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "C:\Users\msala\AppData\Local\Programs\Python\Python39\lib\socket.py", line 954, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connectionpool.py", line 790, in urlopen
    response = self._make_request(
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connectionpool.py", line 491, in _make_request
    raise new_e
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connectionpool.py", line 467, in _make_request
    self._validate_conn(conn)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connectionpool.py", line 1092, in _validate_conn
    conn.connect()
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connection.py", line 604, in connect
    self.sock = sock = self._new_conn()
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connection.py", line 207, in _new_conn
    raise NameResolutionError(self.host, self, e) from e
urllib3.exceptions.NameResolutionError: <urllib3.connection.HTTPSConnection object at 0x000001AF8CB443D0>: Failed to resolve 'www.bbc.comhttps' ([Errno 11001] getaddrinfo failed)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\adapters.py", line 486, in send
    resp = conn.urlopen(
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\connectionpool.py", line 844, in urlopen
    retries = retries.increment(
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\urllib3\util\retry.py", line 515, in increment
    raise MaxRetryError(_pool, url, reason) from reason  # type: ignore[arg-type]
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.bbc.comhttps', port=443): Max retries exceeded with url: //www.bbc.com/future/article/20230512-eurovision-why-some-countries-vote-for-each-other (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AF8CB443D0>: Failed to resolve 'www.bbc.comhttps' ([Errno 11001] getaddrinfo failed)"))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\msala\PycharmProjects\pythonProject1\main.py", line 26, in <module>
    article_response = requests.get(article_url)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\sessions.py", line 587, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\sessions.py", line 701, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\msala\PycharmProjects\learnPython\venv\pythonProject1\lib\site-packages\requests\adapters.py", line 519, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.bbc.comhttps', port=443): Max retries exceeded with url: //www.bbc.com/future/article/20230512-eurovision-why-some-countries-vote-for-each-other (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AF8CB443D0>: Failed to resolve 'www.bbc.comhttps' ([Errno 11001] getaddrinfo failed)"))
Process finished with exit code 1
I have tried multiple times to fix the code, but I have not been able to work out what is causing this error.
You try to fetch the URL

https://www.bbc.comhttps://www.bbc.com/future/article/20230512-eurovision-why-some-countries-vote-for-each-other

which is clearly wrong. It is caused by the following statement:

article_url = base_url + href
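For illustration, here is a minimal, self-contained reproduction of that concatenation. The href value is copied from the traceback above; such promo links are already absolute, so they fall through the '//' branch in your loop:

# Minimal reproduction of the failing URL construction (values taken from the traceback).
base_url = "https://www.bbc.com"
href = "https://www.bbc.com/future/article/20230512-eurovision-why-some-countries-vote-for-each-other"

article_url = base_url + href
print(article_url)
# https://www.bbc.comhttps://www.bbc.com/future/article/...
# requests/urllib3 parse 'www.bbc.comhttps' as the host name, which cannot be
# resolved, hence the NameResolutionError above.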
You should not prefix an already absolute URL. Check whether href is already a URL you can fetch directly; you can use the validators package or write your own logic.
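A minimal sketch of that check, using only the standard library instead of the validators package (the helper name resolve_article_url is mine, purely for illustration):

from urllib.parse import urljoin

def resolve_article_url(base_url: str, href: str) -> str:
    # Illustrative helper: href may be relative ('/news/...'),
    # protocol-relative ('//www.bbc.com/...') or already absolute.
    if href.startswith(("http://", "https://")):
        return href                     # already absolute: fetch it directly
    return urljoin(base_url, href)      # resolves the relative and '//' forms

print(resolve_article_url("https://www.bbc.com", "/news/world"))
print(resolve_article_url("https://www.bbc.com", "//www.bbc.com/news/world"))
print(resolve_article_url("https://www.bbc.com", "https://www.bbc.com/news/world"))

In the loop from the question you would then replace the whole if href.startswith('//') / else block with article_url = resolve_article_url(base_url, href). Since urljoin already leaves absolute URLs untouched, calling urljoin(base_url, href) directly would work as well.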