I am trying to scrape a specific portion of a website's HTML to get some text from a "bio" section. When I run the launcher, it shuts down while loading the extension this code lives in and returns this traceback:
Traceback (most recent call last):
File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 198, in _run_module_as_main
return _run_code(code, main_globals, None,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 88, in _run_code
exec(code, run_globals)
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy\__main__.py", line 39, in <module>
cli.main()
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 430, in main
run()
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File ".\launcher.py", line 8, in <module>
bot.run(VERSION)
File ".\libs\bot\__init__.py", line 62, in run
super().run(self.TOKEN, reconnect=True)
File ".\Lib\site-packages\discord\client.py", line 860, in run
asyncio.run(runner())
File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\runners.py", line 194, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 685, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File ".\Lib\site-packages\discord\client.py", line 849, in runner
await self.start(token, reconnect=reconnect)
File ".\Lib\site-packages\discord\client.py", line 777, in start
await self.login(token)
File ".\Lib\site-packages\discord\client.py", line 612, in login
data = await self.http.static_login(token)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\Lib\site-packages\discord\http.py", line 803, in static_login
data = await self.request(Route('GET', '/users/@me'))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\Lib\site-packages\discord\http.py", line 625, in request
async with self.__session.request(method, url, **kwargs) as response:
File ".\Lib\site-packages\aiohttp\client.py", line 1194, in __aenter__
self._resp = await self._coro
^^^^^^^^^^^^^^^^
File ".\Lib\site-packages\aiohttp\client.py", line 603, in _request
resp = await req.send(conn)
^^^^^^^^^^^^^^^^^^^^
File ".\Lib\site-packages\aiohttp\client_reqrep.py", line 713, in send
await writer.write_headers(status_line, self.headers)
File ".\Lib\site-packages\aiohttp\http_writer.py", line 129, in write_headers
buf = _serialize_headers(status_line, headers)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "aiohttp\\_http_writer.pyx", line 132, in aiohttp._http_writer._serialize_headers
File "aiohttp\\_http_writer.pyx", line 116, in aiohttp._http_writer._safe_header
ValueError: Newline or carriage return character detected in HTTP status message or header. This is a potential security issue.
I've made an effort to sanitise the returned HTML before it's passed on, like so:
from discord.ext.commands import Cog, command
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import bleach


class Scraper:
    @staticmethod
    async def fetch(url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.text()

    @staticmethod
    async def scrape(url):
        html_content = await Scraper.fetch(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        ### SANITIZATION HERE
        cleaned_html_content = bleach.clean(str(soup), tags=['span'], attributes={'span': ['class']})  ## whitelist tags
        stripped_html_content = cleaned_html_content.replace('\n', '').replace('\r', '')  ## remove that nasty stuff, I hope?
        ## TODO: check for anything else that needs sanitising for security too
        soup = BeautifulSoup(stripped_html_content, 'html.parser')  ## soup it up baybee
        ## SANITIZATION ENDS
        intros = soup.select('span.character_selfintroduction')  ## grab the character bio
        intro_texts = []
        for intro in intros:
            intro_text = Scraper.extract_text_from_span(intro)
            intro_texts.append(intro_text)
        return intro_texts

    @staticmethod  ## no idea how else to iterate over the HTML when it's nested (see the get_text() sketch after this code)
    def extract_text_from_span(span):
        text = ''
        for child in span.children:
            if isinstance(child, str):
                text += child.strip()
            elif child.name == 'span':
                text += Scraper.extract_text_from_span(child)
        return text


class xivLodestone(Cog):
    def __init__(self, bot):
        self.bot = bot

    @command(name="get_whoami")  ## test command for now, just return the bio
    async def scrape_introductions(self, ctx, lodestoneID: int):
        scraper = Scraper()
        urls = [f'https://eu.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://fr.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://de.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://jp.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://na.finalfantasyxiv.com/lodestone/character/{lodestoneID}'
                ]  ## added all of them as a fallback, might make that a list it iterates instead?
        intro_texts = await asyncio.gather(*(scraper.scrape(url) for url in urls))
        for intro_text_list in intro_texts:
            for intro_text in intro_text_list:
                await ctx.send(intro_text)

    @Cog.listener()
    async def on_ready(self):
        if not self.bot.ready:
            self.bot.cogs_ready.ready_up("xivLodestone")


def setup(bot):
    bot.add_cog(xivLodestone(bot))
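(Side note on the recursive helper above: I suspect BeautifulSoup's get_text() would flatten the nested spans for me, so extract_text_from_span may be unnecessary. A minimal, untested sketch of what I mean:)

    ## possible simplification of extract_text_from_span, assuming get_text()
    ## concatenates the text of all nested children the way I want
    intros = soup.select('span.character_selfintroduction')
    intro_texts = [intro.get_text(strip=True) for intro in intros]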
I expected the scraper code above to whitelist only the HTML elements I needed while stripping out any characters that pose a security risk.
I'm not sure what else to do in this case.
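For what it's worth, this is the quick check I was planning to add before ctx.send() to confirm whether any carriage returns or newlines actually survive the bleach/replace pass (just a diagnostic sketch, not a fix):

    for intro_text_list in intro_texts:
        for intro_text in intro_text_list:
            ## flag anything that still carries CR/LF before it goes anywhere near an HTTP request
            if '\r' in intro_text or '\n' in intro_text:
                print(f"Unsanitised text slipped through: {intro_text!r}")
            await ctx.send(intro_text)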
EDIT: It's been brought to my attention that I'd been focusing on HTML rather than HTTP. I'll come back to this later if I can't figure it out; please forgive the newbie error.
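Following on from that: the traceback shows the failure inside static_login, i.e. on the very first request discord.py makes with the bot token, before any of my scraping code runs. So my current guess is that the offending header is the Authorization header built from TOKEN (for example a trailing newline picked up when reading it from a file) rather than anything from the Lodestone HTML. A minimal sketch of what I'll try in my bot's run(), assuming the token is read from a file (the path here is just illustrative):

    ## hypothetical change in libs/bot/__init__.py: strip whitespace/newlines from the token
    ## before it ends up in the Authorization header
    with open("./libs/bot/token.txt", "r", encoding="utf-8") as token_file:
        self.TOKEN = token_file.read().strip()  ## .strip() drops a trailing \n or \r\n
    super().run(self.TOKEN, reconnect=True)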
EDIT 2: Added the full traceback. I cleaned up the directories to remove some personal info, making them relative to the working directory or just noting that they're in AppData.