Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions bin/run
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@ class App < Thor
initialize_logger
end

desc 'simple_parser', 'run simple driver'
def simple_parser
Parser::Example.run
# desc 'simple_parser', 'run simple driver'
# def simple_parser
# Parser::Example.run
# end

desc 'mosaic_parser', 'run mosaic driver'
def mosaic_parser
Parser::MosaicGoodsParser.run
end

private
Expand Down
1 change: 1 addition & 0 deletions lib/autoload.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Namespace for the parser drivers. Each constant below is registered with
# `autoload`, mapping the constant name to the feature that defines it.
module Parser
  {
    Example: 'example',
    MosaicGoodsParser: 'mosaic_goods_parser'
  }.each do |const_name, feature|
    autoload(const_name, feature)
  end
end

95 changes: 95 additions & 0 deletions lib/mosaic_goods_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
require 'mechanize'
require 'nokogiri'

module Parser
  # Collects price, "new offers" count and Best Sellers Rank for every row of
  # the `goods` table twice -- once through the Selenium driver and once
  # through a plain Mechanize request -- and stores both sets of values back
  # on the same row (`browser_*` and `xhr_*` columns).
  class MosaicGoodsParser
    # `id` is selected together with `url` so each result is written back to
    # the row it was read from, independent of id gaps or result ordering.
    GOODS_QUERY = 'SELECT id, url FROM goods'

    # Entry point: iterates over all goods, scrapes each URL with both
    # clients and updates the row.
    #
    # Exceptions raised while navigating or fetching still propagate to the
    # caller, but the browser and the database connection are always
    # released first.
    #
    # @return [void]
    def self.run
      driver = O14::WebBrowser.get_driver
      db = nil

      begin
        db = O14::DB.get_db
        agent = Mechanize.new
        headers = request_headers

        db[GOODS_QUERY].all.each do |row|
          url = row[:url]

          browser = browser_metrics(driver, url)
          xhr = xhr_metrics(agent, url, headers)

          db[:goods].where(id: row[:id]).update(
            browser_price: browser[:price],
            browser_rank: browser[:rank],
            browser_new_count: browser[:new_count],
            xhr_price: xhr[:price],
            xhr_rank: xhr[:rank],
            xhr_new_count: xhr[:new_count]
          )
        end
      ensure
        driver.quit_browser
        db&.disconnect
      end
    end

    # Request headers sent with every Mechanize request.
    #
    # FIXME(security): the cookie below is a hard-coded account session that
    # is committed to the repository. It should be rotated and loaded from
    # configuration instead; it is kept verbatim here so request behaviour
    # does not change.
    #
    # @return [Hash] header name => value
    def self.request_headers
      {
        'host' => 'www.amazon.com',
        'user-agent' => 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36',
        'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language' => 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'accept-encoding' => 'gzip, deflate, br, zstd',
        'upgrade-insecure-requests' => 1,
        'sec-fetch-dest' => 'document',
        'sec-fetch-mode' => 'navigate',
        'sec-fetch-site' => 'same-origin',
        'connection' => 'keep-alive',
        'cookie' => 'session-id=134-9346825-3549307; ubid-main=135-3288814-5967735; x-main="yabq?Ncy6gVzDfSnj06z8NkBu78cHLKe4QcMlkl7AP40lf6ebDs8bXMeA189lyEr"; at-main=Atza|IwEBICK34oX7TI5cwZF5EFdpJPCChsagmH0SfTiPBt6u9WP230Es8PE2zJ24cjSXH4dL772p8NUCQqpiZy1rvJkSiLtSNImjR_im13UnfuoHtmmkmsYbZw1_-Kq5qpjVmcTLhIZU93uAU8-d-sH6Q7dLfxs-hLJQogWo0eVf-ZlspbZy9-_1b9qrn3FZH4sVQbW2Nt4av-dSwEOFMU8BZmovpzPbozCaxD1MNHNqShcfU3NAPg; sess-at-main="znUvWopJIDUsHdLH47k3Yhol7ad+m5m+jPEilXF0o0k="; sst-main=Sst1|PQF1R1AZby-6UYG7VqDx8797CQyX_jc6vCkCUWOIkz4XShd_VT24y5fzLHPuOpBB6GzuAD2A8LhZaNmPJswyR6ZiteJkesk3wSThafAgTN78EtPdnyNuTg7qrJjC3uEWi2Pd4z7CtMPu0A9US__yoOq8GO0aRxIJ1AwOAGJQr-dysSpnNbrWhdkGRPCvyAjOtpAB_P1ABttjk2xvLVNBOFxaKICn72LhZ2BN_FQQga2fnHKQLMVnzZGIvLipcEIHPrjBKJJ3Lk5QF13mjuMYx8ceTA-A5XgJ4MiyyunYwYvJXVk; lc-main=en_US; session-id-time=2082787201l; i18n-prefs=USD; session-token=1BaTGtdXysiwvFJBdamdM8z9LICkQoUSTdLgfUmQdvBgVWoVMtYt7O6DFkFO9KH2nvXZ6qg7vz5grjrhuFohBE7juORZXfSLS2Agaf3BIgfakmgI95ZLvK79vbe6ggxrDjmElQA0qUCgq2IEvN1tdUjhYqAYEqncOr8BLqE0OhKziUMRm/wCk1Cndc83XUx+HISJC1FvLrAY8yD/UOliZGUKIz8h200IrsVcC9D/67mKLY7bUnMAx8SOcOv1uJ6oVqLOz0knKnBDS370fyC50tB47FyH8XY/1iS93CkD91Chz2UjJuo0gPqPzx8220Ys8+c7Cf746rI4EK/xhOzJiYBUc0kv20Ahcw5QGrz/7V8yZv0m0inoiGq4fyFtT5+J; csm-hit=tb:s-D0TQ70SMSNMZ64BW4S8G|1714217643142&t:1714217646290&adb:adblk_no',
        'pragma' => 'no-cache',
        'cache-control' => 'max-age=0'
      }
    end

    # Runs the block and returns its value, or `fallback` when the block
    # raises a StandardError. Scraping is best-effort: a missing element
    # must not abort the whole run.
    def self.attempt(fallback)
      yield
    rescue StandardError
      fallback
    end

    # Extracts the first decimal price from `text`. Thousands separators are
    # dropped first so "$1,234.56" yields 1234.56 rather than 234.56.
    #
    # @return [Float] 0.0 when no decimal number is present
    def self.parse_price(text)
      text.delete(',').scan(/\d+\.\d+/).first.to_f
    end

    # Extracts a rank from `text`, preferring the number that follows '#'
    # and otherwise taking the first number. Accepts any amount of comma
    # grouping, so "#532" and "#1,234,567" both parse.
    #
    # @return [Integer] 0 when no number is present
    def self.parse_rank(text)
      digits = text[/#\s*(\d[\d,]*)/, 1] || text[/\d[\d,]*/]
      digits.to_s.delete(',').to_i
    end

    # Navigates the Selenium driver to `url` and reads the metrics from the
    # rendered page. Every lookup falls back to 0 on failure.
    #
    # @return [Hash] with :price, :rank and :new_count
    def self.browser_metrics(driver, url)
      driver.navigate.to(url)

      price = attempt(0) do
        parse_price(
          driver.find_element(css: '.a-box-inner span.a-price>.a-offscreen').attribute('innerHTML')
        )
      end

      # The element's markup must be read before scanning; a result without
      # digits yields nil so the second selector is still tried.
      new_count = attempt(nil) do
        driver.find_element(css: '.olp-text-box span').attribute('innerHTML').scan(/\d+/).first&.to_i
      end
      new_count ||= attempt(0) do
        driver.find_element(css: '#dynamic-aod-ingress-box .a-declarative span')
              .attribute('innerHTML').scan(/\d+/).first.to_i
      end

      details = attempt(nil) { driver.find_element(css: '.detail-bullets-wrapper') }
      rank =
        if details
          fragment = Nokogiri::HTML5(details.attribute('outerHTML'))
          attempt(0) do
            parse_rank(fragment.xpath("//*[contains(text(), 'Best Sellers Rank')]/parent::*").first.text)
          end
        else
          attempt(0) do
            parse_rank(
              driver.find_element(xpath: "//th[contains(text(), 'Best Sellers Rank')]/parent::tr/td").text
            )
          end
        end

      { price: price, rank: rank, new_count: new_count }
    end

    # Fetches `url` with Mechanize (sending the URL itself as referer) and
    # reads the same metrics from the returned document.
    #
    # @return [Hash] with :price, :rank and :new_count
    def self.xhr_metrics(agent, url, headers)
      page = agent.get(url, [], url, headers)

      price = attempt(0) { parse_price(page.search('span.aok-offscreen')[0].text.to_s.strip) }
      rank = attempt(0) { parse_rank(page.search('.zgFirstRank').text) }

      { price: price, rank: rank, new_count: xhr_new_count(page) }
    end

    # Number of other sellers as shown in the fetched document.
    #
    # When the offers text has a parenthesised number, a "New (N)" match is
    # preferred and the first number of the generic size-base text is the
    # fallback. Without a parenthesised number the "all N" link text is used.
    #
    # @return [Integer]
    def self.xhr_new_count(page)
      offers_text = page.search('.a-declarative .a-color-base').text
      return all_offers_count(page) unless offers_text.match?(/\(.*?\d+\)/)

      listing = offers_text[/New \(.*?\d+\)/]
      count = listing && listing[/\d+/].to_i
      count ||= attempt(nil) do
        page.search('.a-declarative .a-box-inner .a-section .a-size-base').text.scan(/\.*?\d+/)[0].to_i
      end
      count || all_offers_count(page)
    end

    # Count taken from the "all N " link text; 0 when the text is absent.
    #
    # @return [Integer]
    def self.all_offers_count(page)
      attempt(0) do
        page.search('.a-declarative .a-link-normal > span').text
            .match(/all.?\d+\ /).to_s.sub(/\ball\b/, '').to_i
      end
    end

    private_class_method :request_headers, :attempt, :parse_price, :parse_rank,
                         :browser_metrics, :xhr_metrics, :xhr_new_count, :all_offers_count
  end
end