From 3084d3abb1418ddcc1ec0908ab8049726100cdc3 Mon Sep 17 00:00:00 2001
From: Alyosha Romanov
Date: Sat, 18 Nov 2023 15:55:42 +0100
Subject: [PATCH 1/6] Fixed script.

---
 app.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index c4b67e4..f89fd5e 100644
--- a/app.py
+++ b/app.py
@@ -34,14 +34,23 @@ def scrape(url):
     data = extractor.extract(r.text,base_url=url)
     reviews = []
     for r in data['reviews']:
-        r["product"] = data["product_title"]
+        r['title'] = r['title'].split(' out of 5 stars ')[-1]
+        r['product'] = data['product_title']
         r['url'] = url
+        if r['found_helpful'] is None:
+            r['found_helpful'] = 0
+        elif 'One person found this helpful' in r['found_helpful']:
+            r['found_helpful'] = 1
+        elif 'people found this helpful' in r['found_helpful']:
+            r['found_helpful'] = int(r['found_helpful'].split()[0])
+        else:
+            r['found_helpful'] = 0
         if 'verified_purchase' in r:
             if 'Verified Purchase' in r['verified_purchase']:
                 r['verified_purchase'] = True
             else:
                 r['verified_purchase'] = False
-        r['rating'] = r['rating'].split(' out of')[0]
+        r['rating'] = r['title'].split(' out of')[0]
         date_posted = r['date'].split('on ')[-1]
         if r['images']:
             r['images'] = "\n".join(r['images'])
@@ -53,7 +62,7 @@ def scrape(url):
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
     data['reviews'] = reviews
-    data['number_of_reviews'] = int(data['number_of_reviews'].split(' customer')[0])
+    data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',',''))
     return data
 
 @app.route('/')

From aa274f1605fd39c95fb92e93c23df5b518ad2b54 Mon Sep 17 00:00:00 2001
From: Alyosha Romanov
Date: Sun, 19 Nov 2023 20:16:08 +0100
Subject: [PATCH 2/6] Fix for rating not being parsed properly and removed install requirements.
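
This reorders the parsing so the numeric rating is taken from the raw title text before the title is stripped. A minimal sketch of the idea, using a made-up title string in Amazon's usual "X.0 out of 5 stars ..." shape (illustrative only, not taken from a real page):

    # hypothetical raw title as delivered by the selector
    title = "4.0 out of 5 stars Great shoes, but they run small"
    rating = int(float(title.split(' out of')[0]))   # -> 4
    title = title.split(' out of 5 stars ')[-1]      # -> "Great shoes, but they run small"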
---
 app.py           |  2 +-
 requirements.txt | 14 --------------
 2 files changed, 1 insertion(+), 15 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/app.py b/app.py
index f89fd5e..a6b9db7 100644
--- a/app.py
+++ b/app.py
@@ -34,6 +34,7 @@ def scrape(url):
     data = extractor.extract(r.text,base_url=url)
     reviews = []
     for r in data['reviews']:
+        r['rating'] = int(float(r['title'].split(' out of')[0]))
         r['title'] = r['title'].split(' out of 5 stars ')[-1]
         r['product'] = data['product_title']
         r['url'] = url
@@ -50,7 +51,6 @@ def scrape(url):
                 r['verified_purchase'] = True
             else:
                 r['verified_purchase'] = False
-        r['rating'] = r['title'].split(' out of')[0]
         date_posted = r['date'].split('on ')[-1]
         if r['images']:
             r['images'] = "\n".join(r['images'])
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e0f806e..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-click==7.1.1
-cssselect==1.1.0
-Flask==1.1.2
-itsdangerous==1.1.0
-Jinja2==2.11.2
-lxml==4.5.0
-MarkupSafe==1.1.1
-parsel==1.5.2
-python-dateutil==2.8.1
-PyYAML==5.3.1
-selectorlib==0.16.0
-six==1.14.0
-w3lib==1.21.0
-Werkzeug==1.0.1

From 36d01ebf6a827625ad960d2dd2b07dcd046f5403 Mon Sep 17 00:00:00 2001
From: vikramdara
Date: Sun, 19 Nov 2023 23:18:42 -0600
Subject: [PATCH 3/6] Modified code so that all reviews from all pages are seen

---
 app.py | 110 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 52 deletions(-)

diff --git a/app.py b/app.py
index a6b9db7..f17dbc2 100644
--- a/app.py
+++ b/app.py
@@ -1,67 +1,73 @@
 from flask import Flask, request, jsonify
 import selectorlib
 import requests
+import json
 from dateutil import parser as dateparser
 app = Flask(__name__)
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
-def scrape(url):
-    headers = {
-        'authority': 'www.amazon.com',
-        'pragma': 'no-cache',
-        'cache-control': 'no-cache',
-        'dnt': '1',
-        'upgrade-insecure-requests': '1',
-        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
-        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-        'sec-fetch-site': 'none',
-        'sec-fetch-mode': 'navigate',
-        'sec-fetch-dest': 'document',
-        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
-    }
+def scrape(url):
+    all_reviews = []
+    while url:
 
-    # Download the page using requests
-    print("Downloading %s"%url)
-    r = requests.get(url, headers=headers)
-    # Simple check to check if page was blocked (Usually 503)
-    if r.status_code > 500:
-        if "To discuss automated access to Amazon data please contact" in r.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
-        else:
-            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
-        return None
-    # Pass the HTML of the page and create
-    data = extractor.extract(r.text,base_url=url)
-    reviews = []
-    for r in data['reviews']:
-        r['rating'] = int(float(r['title'].split(' out of')[0]))
-        r['title'] = r['title'].split(' out of 5 stars ')[-1]
-        r['product'] = data['product_title']
-        r['url'] = url
-        if r['found_helpful'] is None:
-            r['found_helpful'] = 0
-        elif 'One person found this helpful' in r['found_helpful']:
-            r['found_helpful'] = 1
-        elif 'people found this helpful' in r['found_helpful']:
-            r['found_helpful'] = int(r['found_helpful'].split()[0])
-        else:
-            r['found_helpful'] = 0
-        if 'verified_purchase' in r:
-            if 'Verified Purchase' in r['verified_purchase']:
-                r['verified_purchase'] = True
-            else:
-                r['verified_purchase'] = False
-        date_posted = r['date'].split('on ')[-1]
-        if r['images']:
-            r['images'] = "\n".join(r['images'])
-        r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
-        reviews.append(r)
+        headers = {
+            'authority': 'www.amazon.com',
+            'pragma': 'no-cache',
+            'cache-control': 'no-cache',
+            'dnt': '1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'sec-fetch-site': 'none',
+            'sec-fetch-mode': 'navigate',
+            'sec-fetch-dest': 'document',
+            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
+        }
+
+        # Download the page using requests
+        print("Downloading %s"%url)
+        r = requests.get(url, headers=headers)
+        # Simple check to check if page was blocked (Usually 503)
+        if r.status_code > 500:
+            if "To discuss automated access to Amazon data please contact" in r.text:
+                print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+            else:
+                print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
+            return None
+        # Pass the HTML of the page and create
+        data = extractor.extract(r.text,base_url=url)
+        reviews = []
+        for r in data['reviews']:
+            r['rating'] = int(float(r['title'].split(' out of')[0]))
+            r['title'] = r['title'].split(' out of 5 stars ')[-1]
+            r['product'] = data['product_title']
+            r['url'] = url
+            if r['found_helpful'] is None:
+                r['found_helpful'] = 0
+            elif 'One person found this helpful' in r['found_helpful']:
+                r['found_helpful'] = 1
+            elif 'people found this helpful' in r['found_helpful']:
+                r['found_helpful'] = int(r['found_helpful'].split()[0])
+            else:
+                r['found_helpful'] = 0
+            if 'verified_purchase' in r:
+                if 'Verified Purchase' in r['verified_purchase']:
+                    r['verified_purchase'] = True
+                else:
+                    r['verified_purchase'] = False
+            date_posted = r['date'].split('on ')[-1]
+            if r['images']:
+                r['images'] = "\n".join(r['images'])
+            r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
+            reviews.append(r)
+        all_reviews.extend(reviews)
+        url = data.get('next_page')
+    data['reviews'] = all_reviews
     histogram = {}
     for h in data['histogram']:
         histogram[h['key']] = h['value']
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
-    data['reviews'] = reviews
     data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',',''))
     return data
@@ -70,5 +76,5 @@ def api():
     url = request.args.get('url',None)
     if url:
         data = scrape(url)
-        return jsonify(data)
-    return jsonify({'error':'URL to scrape is not provided'}),400
\ No newline at end of file
+        return json.dumps(data, indent=2), 200, {'Content-Type': 'application/json; charset=utf-8'}
+    return json.dumps({'error': 'URL to scrape is not provided'}, indent=2), 400, {'Content-Type': 'application/json; charset=utf-8'}
\ No newline at end of file

From 1a4300c06f2bb02f59e8dfe3c884b90a985c4e3b Mon Sep 17 00:00:00 2001
From: Alyosha Romanov
Date: Sun, 26 Nov 2023 22:29:49 +0100
Subject: [PATCH 4/6] Only load one page, but with specified page. Fix for NoneType error.

---
 app.py | 109 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/app.py b/app.py
index f17dbc2..26ea9ac 100644
--- a/app.py
+++ b/app.py
@@ -7,62 +7,57 @@
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
 def scrape(url):
-    all_reviews = []
-    while url:
+    headers = {
+        'authority': 'www.amazon.com',
+        'pragma': 'no-cache',
+        'cache-control': 'no-cache',
+        'dnt': '1',
+        'upgrade-insecure-requests': '1',
+        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
+        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+        'sec-fetch-site': 'none',
+        'sec-fetch-mode': 'navigate',
+        'sec-fetch-dest': 'document',
+        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
+    }
 
-        headers = {
-            'authority': 'www.amazon.com',
-            'pragma': 'no-cache',
-            'cache-control': 'no-cache',
-            'dnt': '1',
-            'upgrade-insecure-requests': '1',
-            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
-            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-            'sec-fetch-site': 'none',
-            'sec-fetch-mode': 'navigate',
-            'sec-fetch-dest': 'document',
-            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
-        }
-
-        # Download the page using requests
-        print("Downloading %s"%url)
-        r = requests.get(url, headers=headers)
-        # Simple check to check if page was blocked (Usually 503)
-        if r.status_code > 500:
-            if "To discuss automated access to Amazon data please contact" in r.text:
-                print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
-            else:
-                print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
-            return None
-        # Pass the HTML of the page and create
-        data = extractor.extract(r.text,base_url=url)
-        reviews = []
-        for r in data['reviews']:
-            r['rating'] = int(float(r['title'].split(' out of')[0]))
-            r['title'] = r['title'].split(' out of 5 stars ')[-1]
-            r['product'] = data['product_title']
-            r['url'] = url
-            if r['found_helpful'] is None:
-                r['found_helpful'] = 0
-            elif 'One person found this helpful' in r['found_helpful']:
-                r['found_helpful'] = 1
-            elif 'people found this helpful' in r['found_helpful']:
-                r['found_helpful'] = int(r['found_helpful'].split()[0])
-            else:
-                r['found_helpful'] = 0
-            if 'verified_purchase' in r:
-                if 'Verified Purchase' in r['verified_purchase']:
-                    r['verified_purchase'] = True
-                else:
-                    r['verified_purchase'] = False
-            date_posted = r['date'].split('on ')[-1]
-            if r['images']:
-                r['images'] = "\n".join(r['images'])
-            r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
-            reviews.append(r)
-        all_reviews.extend(reviews)
-        url = data.get('next_page')
-    data['reviews'] = all_reviews
+    # Download the page using requests
+    print("Downloading %s"%url)
+    r = requests.get(url, headers=headers)
+    # Simple check to check if page was blocked (Usually 503)
+    if r.status_code > 500:
+        if "To discuss automated access to Amazon data please contact" in r.text:
+            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+        else:
+            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
+        return None
+    # Pass the HTML of the page and create
+    data = extractor.extract(r.text,base_url=url)
+    reviews = []
+    for r in data['reviews']:
+        r['rating'] = int(float(r['title'].split(' out of')[0]))
+        r['title'] = r['title'].split(' out of 5 stars ')[-1]
+        r['product'] = data['product_title']
+        r['url'] = url
+        if r['found_helpful'] is None:
+            r['found_helpful'] = 0
+        elif 'One person found this helpful' in r['found_helpful']:
+            r['found_helpful'] = 1
+        elif 'people found this helpful' in r['found_helpful']:
+            r['found_helpful'] = int(r['found_helpful'].split()[0])
+        else:
+            r['found_helpful'] = 0
+        if 'verified_purchase' in r and r['verified_purchase'] is not None:
+            if 'Verified Purchase' in r['verified_purchase']:
+                r['verified_purchase'] = True
+            else:
+                r['verified_purchase'] = False
+        date_posted = r['date'].split('on ')[-1]
+        if r['images']:
+            r['images'] = "\n".join(r['images'])
+        r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
+        reviews.append(r)
+    data['reviews'] = reviews
     histogram = {}
     for h in data['histogram']:
         histogram[h['key']] = h['value']
@@ -74,6 +69,12 @@ def scrape(url):
 @app.route('/')
 def api():
     url = request.args.get('url',None)
+    if request.args.get('pageNumber',None) is None:
+        url += '&pageNumber=1'
+    elif int(request.args.get('pageNumber',None)) <= 10:
+        url += '&pageNumber='+request.args.get('pageNumber',None)
+    else:
+        return json.dumps({'error': 'Page number should be less than or equal to 10'}, indent=2), 400, {'Content-Type': 'application/json; charset=utf-8'}
     if url:
         data = scrape(url)
         return json.dumps(data, indent=2), 200, {'Content-Type': 'application/json; charset=utf-8'}

From ca0baa23e7fcbe6686cca178886be74257fb8324 Mon Sep 17 00:00:00 2001
From: Alyosha Romanov
Date: Fri, 8 Dec 2023 23:47:14 +0100
Subject: [PATCH 5/6] Fixed None type error and implemented exceptions for errors.
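
Failures in scrape() now surface as exceptions and the route converts them into JSON error responses. A standalone sketch of that pattern (fake_scrape is a stand-in used only for illustration; the real scrape() raises on blocked pages and on empty extractions):

    import json

    def to_json(data, status):
        # same shape as the helper added in app.py
        return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}

    def fake_scrape(url):
        # stand-in for scrape(); raises the way the real function now does when blocked
        raise Exception("Page %s was blocked by Amazon. Please try using better proxies" % url)

    try:
        body, status, headers = to_json(fake_scrape("https://www.amazon.com/product-reviews/B07ZPL752N"), 200)
    except Exception as e:
        body, status, headers = to_json({'error': str(e)}, 400)
    print(status, body)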
---
 .gitignore |  9 ++++++++-
 app.py     | 48 ++++++++++++++++++++++++++++++----------------
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index c0b8c74..90b6e22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,4 +136,11 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-.vscode/
\ No newline at end of file
+.vscode/### Example user template template
+### Example user template
+
+# IntelliJ project files
+.idea
+*.iml
+out
+gen
diff --git a/app.py b/app.py
index 26ea9ac..baa6505 100644
--- a/app.py
+++ b/app.py
@@ -3,9 +3,11 @@
 import requests
 import json
 from dateutil import parser as dateparser
+
 app = Flask(__name__)
 extractor = selectorlib.Extractor.from_yaml_file('selectors.yml')
 
+
 def scrape(url):
     headers = {
         'authority': 'www.amazon.com',
@@ -22,17 +24,22 @@ def scrape(url):
     }
 
     # Download the page using requests
-    print("Downloading %s"%url)
+    print("Downloading %s" % url)
     r = requests.get(url, headers=headers)
     # Simple check to check if page was blocked (Usually 503)
     if r.status_code > 500:
         if "To discuss automated access to Amazon data please contact" in r.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
+            raise Exception("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
         else:
-            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
-        return None
+            raise Exception("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
+
     # Pass the HTML of the page and create
-    data = extractor.extract(r.text,base_url=url)
+    data = extractor.extract(r.text, base_url=url)
+
+    # check if the extracted data is empty
+    if data['reviews'] is None:
+        raise Exception("ERROR: No data extracted. Check selector config")
+
     reviews = []
     for r in data['reviews']:
         r['rating'] = int(float(r['title'].split(' out of')[0]))
@@ -63,19 +70,28 @@ def scrape(url):
     histogram[h['key']] = h['value']
     data['histogram'] = histogram
     data['average_rating'] = float(data['average_rating'].split(' out')[0])
-    data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',',''))
-    return data
-
+    data['number_of_reviews'] = int(data['number_of_reviews'].split(' global ratings')[0].replace(',', ''))
+    return data
+
+
+def to_json(data, status):
+    return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}
+
+
 @app.route('/')
 def api():
-    url = request.args.get('url',None)
-    if request.args.get('pageNumber',None) is None:
+    url = request.args.get('url', None)
+    if request.args.get('pageNumber', None) is None:
         url += '&pageNumber=1'
-    elif int(request.args.get('pageNumber',None)) <= 10:
-        url += '&pageNumber='+request.args.get('pageNumber',None)
+    elif int(request.args.get('pageNumber', None)) <= 10:
+        url += '&pageNumber=' + request.args.get('pageNumber', None)
     else:
-        return json.dumps({'error': 'Page number should be less than or equal to 10'}, indent=2), 400, {'Content-Type': 'application/json; charset=utf-8'}
+        return to_json({'error': 'Page number should be less than or equal to 10'}, 400)
+
     if url:
-        data = scrape(url)
-        return json.dumps(data, indent=2), 200, {'Content-Type': 'application/json; charset=utf-8'}
-    return json.dumps({'error': 'URL to scrape is not provided'}, indent=2), 400, {'Content-Type': 'application/json; charset=utf-8'}
\ No newline at end of file
+        try:
+            data = scrape(url)
+            return to_json(data, 200)
+        except Exception as e:
+            return to_json({'error': str(e)}, 400)
+    return to_json({'error': 'URL to scrape is not provided'}, 400)

From 148d911bb97bca1cda8fb6a44645eab7aad09535 Mon Sep 17 00:00:00 2001
From: Alyosha Romanov
Date: Sat, 9 Dec 2023 00:09:07 +0100
Subject: [PATCH 6/6] Fixed None type error and implemented exceptions for errors. Updated readme and added requirements.txt

---
 .gitignore       | 1 -
 README.md        | 4 ++--
 app.py           | 6 +++---
 requirements.txt | 4 ++++
 4 files changed, 9 insertions(+), 6 deletions(-)
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 90b6e22..c2969b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,7 +137,6 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
 .vscode/### Example user template template
-### Example user template
 
 # IntelliJ project files
 .idea
diff --git a/README.md b/README.md
index 912c3aa..4c6a053 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 A very basic API to scrape product reviews from Amazon and get data in real time as JSON with all fields, that amazon product adverstising api does not provide you.
 
-Full Tutorail - [Amazon Product Reviews API – build you own using Python](https://www.scrapehero.com/free-amazon-product-reviews-api-build-you-own-using-python/)
+Forked from [ScrapeHero/Amazon-Review-Scraper](https://github.com/scrapehero-code/amazon-review-scraper)
 
 ## Usage
 
 Go into the project folder
 1. Install requirements `pip install -r requirements.txt`
 2. Set FLASK_APP - `export FLASK_APP=app.py`
 3. Run App - `flask run`
 4. Call API with Review Page URL. Eg: [`http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`](http://localhost:5000/?url=https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews)
 
 ## Example Data Format
diff --git a/app.py b/app.py
index baa6505..ef3e242 100644
--- a/app.py
+++ b/app.py
@@ -1,4 +1,4 @@
-from flask import Flask, request, jsonify
+from flask import Flask, request
 import selectorlib
 import requests
 import json
@@ -74,7 +74,7 @@ def scrape(url):
     return data
 
 
-def to_json(data, status):
+def to_json(data, status=200):
     return json.dumps(data, indent=2), status, {'Content-Type': 'application/json; charset=utf-8'}
 
 
@@ -91,7 +91,7 @@ def api():
     if url:
         try:
             data = scrape(url)
-            return to_json(data, 200)
+            return to_json(data)
         except Exception as e:
             return to_json({'error': str(e)}, 400)
     return to_json({'error': 'URL to scrape is not provided'}, 400)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..be90693
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+flask
+selectorlib
+requests
+python-dateutil
\ No newline at end of file
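
For reference, once these patches are applied and the Flask app is running locally (flask run, as described in the README), a request against the API with the optional pageNumber parameter added in patch 4 could look roughly like this (a sketch only; the product URL is the example one from the README):

    import requests

    product_url = ("https://www.amazon.com/Nike-Womens-Reax-Running-Shoes/product-reviews/"
                   "B07ZPL752N/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
    # pageNumber is optional; the API defaults it to 1 and rejects values above 10
    resp = requests.get("http://localhost:5000/", params={"url": product_url, "pageNumber": 2})
    print(resp.status_code)
    print(resp.text)   # pretty-printed JSON produced by json.dumps(..., indent=2)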