Sara Jitkresorn :
I'm using the code to web scrape customer reviews. Everything works according to what I wanted the code to do but I couldn't get the class or attribute for the ratings right, so the code always returns blank results for the Ratings
column.
Could someone help me find the right attribute and fix the Ratings
code line?
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
print ('all imported successfuly')
# Initialize an empty dataframe
df = pd.DataFrame()
for x in range(1, 37):
names = []
headers = []
bodies = []
ratings = []
published = []
updated = []
reported = []
link = (f'https://www.trustpilot.com/review/fabfitfun.com?page={x}')
print (link)
req = requests.get(link)
content = req.content
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('article', {'class':'review'})
for article in articles:
names.append(article.find('div', attrs={'class': 'consumer-information__name'}).text.strip())
headers.append(article.find('h2', attrs={'class':'review-content__title'}).text.strip())
try:
bodies.append(article.find('p', attrs={'class':'review-content__text'}).text.strip())
except:
bodies.append('')
try:
#ratings.append(article.find('div', attrs={'class':'star-rating star-rating--medium'}).text.strip())
ratings.append(article.find('div', attrs={'class': 'star-rating star-rating--medium'})['alt'])
except:
ratings.append('')
dateElements = article.find('div', attrs={'class':'review-content-header__dates'}).text.strip()
jsonData = json.loads(dateElements)
published.append(jsonData['publishedDate'])
updated.append(jsonData['updatedDate'])
reported.append(jsonData['reportedDate'])
# Create your temporary dataframe of the first iteration, then append that into your "final" dataframe
temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date':updated, 'Reported Date':reported})
df = df.append(temp_df, sort=False).reset_index(drop=True)
print ('pass1')
df.to_csv('FabfitfunReviews007.csv', index=False, encoding='utf-8')
print ('excel done')
petezurich :
Just change this line in your code:
ratings.append(article.find_all("img", alt=True)[0]["alt"])
df.Rating
then outputs to:
0 1 star: Bad
1 5 stars: Excellent
2 5 stars: Excellent
3 5 stars: Excellent
4 5 stars: Excellent
5 5 stars: Excellent
6 5 stars: Excellent
It seems easier to just find the img
tag in the article and retrieve the alt-text from it.