From d1e65e73ada492a65ce53e14a169f8a8ae65bf0b Mon Sep 17 00:00:00 2001
From: Daniil-Sokolskiy
Date: Wed, 7 Aug 2024 17:08:56 +0400
Subject: [PATCH] added the ability to parse the amazon page by link in cli
 from the current page to the end, the data is uploaded to the db

---
 Gemfile.lock             |   1 +
 bin/run                  |   6 +-
 lib/example.rb           | 125 +++++++++++++++++++++++++++++++++++++--
 models/amazon_product.rb |   8 +++
 4 files changed, 133 insertions(+), 7 deletions(-)
 create mode 100644 models/amazon_product.rb

diff --git a/Gemfile.lock b/Gemfile.lock
index 82ddb07..0271dd1 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -36,6 +36,7 @@ GEM
 PLATFORMS
   x86_64-darwin-20
   x86_64-darwin-21
+  x86_64-linux
 
 DEPENDENCIES
   autoload
diff --git a/bin/run b/bin/run
index 7bd6f1e..c1d282f 100755
--- a/bin/run
+++ b/bin/run
@@ -14,9 +14,9 @@ class App < Thor
     initialize_logger
   end
 
-  desc 'simple_parser', 'run simple driver'
-  def simple_parser
-    Parser::Example.run
+  desc 'simple_parser URL', 'run simple parser with given URL'
+  def simple_parser(url)
+    Parser::Example.run(url)
   end
 
   private
diff --git a/lib/example.rb b/lib/example.rb
index c10f066..2bd2097 100644
--- a/lib/example.rb
+++ b/lib/example.rb
@@ -1,13 +1,130 @@
+# lib/example.rb
+require 'selenium-webdriver'
+require_relative '../models/amazon_product'
+require 'uri'
+require 'cgi'
+
 module Parser
+  # Scrapes Amazon search-result pages with Selenium and stores each listed
+  # product through the AmazonProduct model.
   class Example
-    def self.run
+    AMAZON_BASE_URL = 'https://www.amazon.com'.freeze
+
+    # Scrapes the page at +url+ and every page reached from it through the
+    # pagination "next" link. The browser is quit when the run ends, whether
+    # it finished normally or raised.
+    #
+    # @param url [String] result page to start from
+    def self.run(url)
       logger = O14::ProjectLogger.get_logger
       driver = O14::WebBrowser.get_driver
       config = O14::Config.get_config
       db = O14::DB.get_db
-
-      driver.navigate.to 'https://google.com'
-      sleep 30
+
+      begin
+        loop do
+          driver.navigate.to url
+          sleep 5 # fixed pause after navigation, presumably to let the page render
+          page_number = extract_page_number_from_pagination(driver)
+          puts "Current page: #{page_number}"
+          # The breadcrumb category wins; the `i` URL parameter is the fallback.
+          category = extract_category_from_page(driver) || extract_category_from_url(url)
+
+          driver.find_elements(css: '.s-main-slot .s-result-item').each do |product|
+            save_product(product, category)
+          end
+
+          next_button = driver.find_elements(css: 'a.s-pagination-next')
+          if next_button.empty?
+            puts "No more pages. Finished."
+            break
+          end
+
+          # Move on to the next page.
+          url = next_button.first.attribute('href')
+          puts "Moving to the next page: #{page_number + 1}"
+        end
+      ensure
+        driver.quit
+      end
+    end
+
+    # Reads one result tile and inserts it as an AmazonProduct row. Tiles
+    # without a data-asin value are skipped. Every other field is optional
+    # and stored as nil when it cannot be read. A product_id that violates
+    # the unique constraint is reported and skipped.
+    def self.save_product(product, category)
+      product_id = product.attribute('data-asin')
+      return if product_id.nil? || product_id.empty?
+
+      title = attempt { product.find_element(css: 'h2 .a-size-mini').text.strip }
+      price = attempt do
+        whole = product.find_element(css: '.a-price .a-price-whole').text.delete(',').to_f
+        fraction = product.find_element(css: '.a-price .a-price-fraction').text.to_f
+        whole + (fraction / 100)
+      end
+      rating = attempt { product.find_element(css: '.a-icon-alt').text.split.first.to_f }
+      review_count = attempt do
+        product.find_element(css: '.s-link-style .a-size-base').text.delete(',').to_i
+      end
+      availability = attempt do
+        product.find_element(css: '.a-size-base .a-color-price').text.strip
+      end
+      image_url = attempt { product.find_element(css: '.s-image').attribute('src') }
+      product_url = attempt do
+        href = product.find_element(css: 'h2 .a-size-mini').attribute('href')
+        # URI.join keeps an absolute href as is and resolves a relative one,
+        # so the host is never prepended twice; a missing href stays nil.
+        URI.join(AMAZON_BASE_URL, href).to_s unless href.nil? || href.empty?
+      end
+
+      AmazonProduct.create(
+        product_id: product_id,
+        title: title,
+        price: price,
+        rating: rating,
+        review_count: review_count,
+        availability: availability,
+        category: category,
+        image_url: image_url,
+        product_url: product_url
+      )
+    rescue Sequel::UniqueConstraintViolation
+      puts "Duplicate entry for product_id: #{product_id}. Skipping."
+    end
+
+    # Runs the block and returns its value, or nil when it raises, so one
+    # missing element does not drop the whole product.
+    def self.attempt
+      yield
+    rescue StandardError
+      nil
+    end
+
+    private_class_method :save_product, :attempt
+
+    # Returns the number of the selected pagination item, or 1 when the page
+    # has no such item instead of raising NoSuchElementError.
+    def self.extract_page_number_from_pagination(driver)
+      selected = driver.find_elements(
+        css: 'span.s-pagination-item.s-pagination-selected[aria-label^="Current page"]'
+      ).first
+      selected ? selected.text.to_i : 1
+    end
+
+    # Returns the first value of the `i` query parameter of +url+, or nil
+    # when the URL has no query string or no such parameter.
+    def self.extract_category_from_url(url)
+      query = URI.parse(url).query.to_s # query is nil for URLs without "?"
+      CGI.parse(query)['i']&.first
+    end
+
+    # Returns the stripped text of the first breadcrumb item, or nil when it
+    # cannot be read.
+    def self.extract_category_from_page(driver)
+      driver.find_element(css: 'nav.a-breadcrumb .a-breadcrumb-item').text.strip
+    rescue StandardError
+      nil
     end
   end
 end
diff --git a/models/amazon_product.rb b/models/amazon_product.rb
new file mode 100644
index 0000000..3fc817f
--- /dev/null
+++ b/models/amazon_product.rb
@@ -0,0 +1,8 @@
+# models/amazon_product.rb
+require 'sequel'
+require_relative '../lib/o14/db'
+
+DB = O14::DB.get_db
+
+class AmazonProduct < Sequel::Model(DB[:amazon_products])
+end