I want to design python web scraping code to scrape these data (https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). Here is the code:
import os
import requests
import random
import time
import pyarrow.parquet as pq
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from fake_useragent import UserAgent
# URL de la page contenant les liens vers les datasets
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
response = requests.get(base_url)
soup = BeautifulSoup(response.text, "html.parser")
# Chemin où enregistrer les fichiers
download_directory = "C:/Users/flosr/Engineering/Blent.ai Project/datas"
# Fonction pour télécharger un fichier avec un en-tête utilisateur aléatoire et une pause aléatoire
def download_file(url, file_path):
user_agent = UserAgent().random
headers = {"User-Agent": user_agent}
time.sleep(random.uniform(1, 3)) # Ajouter une pause aléatoire entre 1 et 3 secondes
response = requests.get(url, headers=headers)
with open(file_path, "w") as f:
f.write(response.content)
# Parcourir chaque section contenant les liens pour chaque année
for section in soup.find_all("div", class_="faq-answers"):
year = section.find_previous_sibling("div", class_="faq-questions").text.strip()
print(f"Downloading datasets for year {year}...")
# Créer un sous-répertoire pour chaque année
year_directory = os.path.join(download_directory, year)
os.makedirs(year_directory, exist_ok=True)
# Télécharger les fichiers pour chaque mois de l'année
for link in section.find_all("a"):
file_url = urljoin(base_url, link.get("href"))
filename = os.path.basename(file_url)
file_path = os.path.join(year_directory, filename)
# Télécharger le fichier
print(f"Downloading {filename}...")
download_file(file_url, file_path)
# Convertir le fichier Parquet
pq.write_table(pq.read_table(file_path), file_path.replace('.parquet', '.csv'))
print("Download and conversion complete.")
Here is the output :
PS C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code> & 'c:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\Scripts\python.exe' 'c:\Users\flosr\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher' '63645' '--' 'C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\main.py'
Download and conversion complete.
However, nothing appears in the said directory. No error appearing but it still doesnt work. and for some reason it never stops installing dependencies below without any end.
cant try anything if i dont have any error appearing to know whats the problem
Seems that some URLs have whitespace character at the end that needs to be stripped:
Prints: