-
1
class API::BaseController < ApplicationController
  before_filter :authenticate

  # HTTP Basic authentication for every API endpoint, checked against the
  # market-data API credentials from config.
  def authenticate
    authenticate_or_request_with_http_basic do |username, password|
      # BUG FIX: these were two standalone statements, so the username
      # comparison result was discarded and only the password was checked.
      username == MARKET_DATA_API_CONFIG[:username] &&
        password == MARKET_DATA_API_CONFIG[:password]
    end
  end

  private

  # Standard JSON envelope shared by all API actions:
  # {version: <requested api version>, status: <:ok/:error>, **data}.
  def api_response(status, data = {})
    {version: params[:version], status: status}.merge data
  end
end
-
1
class API::BookVersionCategoriesController < API::BaseController
  # GET /book_version_categories
  # Filters by comma-separated warehouse_book_version_id or category_name;
  # responds with :error when neither filter is supplied.
  def index
    relation =
      if params[:warehouse_book_version_id].present?
        BookVersionCategory.where(warehouse_book_version_id: params[:warehouse_book_version_id].split(','))
      elsif params[:category_name].present?
        BookVersionCategory.where(category_name: params[:category_name].split(','))
      end

    if relation
      render json: api_response(:ok, book_version_categories: BookVersionCategorySerializer.new(relation.to_a))
    else
      render json: api_response(:error)
    end
  end

  # GET /book_version_categories/:id
  def show
    # BUG FIX: .find raises RecordNotFound on a missing id, which made the
    # :error branch below unreachable; find_by(id:) returns nil instead
    # (matching ProductStatsController#show).
    book_version_category = BookVersionCategory.find_by(id: params[:id])
    response = book_version_category.present? ? api_response(:ok, book_version_category: BookVersionCategorySerializer.new(book_version_category)) : api_response(:error)

    render json: response
  end
end
-
1
class API::BookVersionsController < API::BaseController
  # GET /book_versions?tld=...&keys=asin1,isbn13_1,...
  # Looks up book versions by ASIN or ISBN13 within one Amazon TLD
  # (Squeel where{} block syntax).
  def index
    if Utilities::TLDS.include?(params[:tld]) && params[:keys].present?
      relation = WarehouseBookVersion.where(tld: params[:tld]).where{(asin.in my{params[:keys].split(',')}) | (isbn13.in my{params[:keys].split(',')})}

      render json: api_response(:ok, book_versions: BookVersionSerializer.new(relation.to_a))
    else
      render json: api_response(:error)
    end
  end

  # GET /book_versions/:id
  def show
    # BUG FIX: .find raises RecordNotFound on a missing id, so the :error
    # branch below could never run; find_by(id:) returns nil instead.
    book_version = WarehouseBookVersion.find_by(id: params[:id])
    response = book_version.present? ? api_response(:ok, book_version: BookVersionSerializer.new(book_version)) : api_response(:error)

    render json: response
  end
end
-
1
class API::CategoriesController < API::BaseController
  # GET /categories?tld=...&name=a,b,c
  # Lists warehouse categories by name within one Amazon TLD.
  def index
    if Utilities::TLDS.include?(params[:tld]) && params[:name].present?
      relation = WarehouseCategory.where(tld: params[:tld], name: params[:name].split(','))

      render json: api_response(:ok, categories: CategorySerializer.new(relation.to_a))
    else
      render json: api_response(:error)
    end
  end

  # GET /categories/:id
  def show
    # BUG FIX: .find raises RecordNotFound on a missing id, making the
    # :error branch unreachable; find_by(id:) returns nil instead.
    category = WarehouseCategory.find_by(id: params[:id])
    response = category.present? ? api_response(:ok, category: CategorySerializer.new(category)) : api_response(:error)

    render json: response
  end
end
-
1
class API::CategoryStatsController < API::BaseController
  # Params accepted as category selectors in #index.
  SELECTABLE_FIELDS = %i[warehouse_category_id warehouse_category_external_id].freeze

  # GET /category_stats
  # Requires a date (or start_date+end_date), a valid tld, and at least one
  # selector from SELECTABLE_FIELDS. Uses Squeel where{} block syntax.
  def index
    if (params[:date].present? || params[:start_date].present? && params[:end_date].present?) && Utilities::TLDS.include?(params[:tld]) && SELECTABLE_FIELDS.any? {|field| params[field].present?}
      dates = params[:date].present? ? [params[:date].to_date] : (params[:start_date].to_date..params[:end_date].to_date).to_a
      # BUG FIX: the date and tld where{} scopes were standalone statements
      # (missing trailing dots) and were never applied to the relation; they
      # are now chained onto it.
      relation = CategoryStat.select('category_stats.*, warehouse_dates.date as date, warehouse_regions.tld').
        includes(:warehouse_region, :warehouse_date, :warehouse_category).references(:all).
        where{warehouse_dates.date.in dates}.
        where{warehouse_regions.tld == my{params[:tld]}}
      relation = relation.where(warehouse_category_id: params[:warehouse_category_id].split(',')) if params[:warehouse_category_id].present?
      relation = relation.where{warehouse_categories.category_id.in my{params[:warehouse_category_external_id].split(',')}} if params[:warehouse_category_external_id].present?

      render json: api_response(:ok, category_stat: CategoryStatSerializer.new(relation.to_a))
    else
      render json: api_response(:error)
    end
  end

  # GET /category_stats/:id
  def show
    # BUG FIX: .find raises RecordNotFound, so the :error branch was dead;
    # find_by(id:) returns nil instead.
    category_stat = CategoryStat.find_by(id: params[:id])
    response = category_stat.present? ? api_response(:ok, category_stat: CategoryStatSerializer.new(category_stat)) : api_response(:error)

    render json: response
  end
end
-
1
class API::ProductStatsController < API::BaseController
  # GET /product_stats
  # Requires a date (or start_date+end_date), a valid tld, and either
  # comma-separated :keys (asin/isbn13) or :warehouse_book_version_ids.
  def index
    # BUG FIX: :warehouse_book_version_ids was tested for mere key presence;
    # a blank value passed the outer check, failed the inner .present? check,
    # and crashed on params[:keys].split with :keys absent. Use .present?
    # consistently.
    if (params[:date].present? || params[:start_date].present? && params[:end_date].present?) && Utilities::TLDS.include?(params[:tld]) && (params[:keys].present? || params[:warehouse_book_version_ids].present?)
      warehouse_date_ids = params[:date].present? ? WarehouseDate.find_by(date: params[:date].to_date).id : WarehouseDate.where(date: params[:start_date].to_date..params[:end_date].to_date).value_of(:id)
      relation = base_relation.where(warehouse_date_id: warehouse_date_ids).where{warehouse_book_versions.tld == my{params[:tld]}}.references(:all)
      relation = if params[:warehouse_book_version_ids].present?
        relation.where(warehouse_book_version_id: params[:warehouse_book_version_ids].split(','))
      else
        keys_array = params[:keys].split(',')
        relation.where{(warehouse_book_versions.asin.in my{keys_array}) | (warehouse_book_versions.isbn13.in my{keys_array})}.references(:all)
      end

      render json: api_response(:ok, product_stats: ProductStatSerializer.new(relation.to_a))
    else
      render json: api_response(:error)
    end
  end

  # GET /product_stats/:id
  def show
    product_stat = base_relation.find_by(id: params[:id])
    response = product_stat.present? ? api_response(:ok, product_stat: ProductStatSerializer.new(product_stat)) : api_response(:error)

    render json: response
  end

  private

  # Eager-loaded scope shared by both actions.
  def base_relation
    WarehouseStat.includes(:warehouse_book_version, :warehouse_date)
  end
end
-
1
class EnterpriseReportsMailer < ActionMailer::Base
  default from: 'data@vook.com'

  helper :application

  # Emails one or more report hashes to the report's configured recipients
  # plus the global all-reports recipient list for the current environment.
  def basic_report(report_hash, report_config)
    recipients = Array.wrap(report_config[:recipients][Rails.env]) + Array.wrap(AmazeBot.config[:reports][:recipients_all_reports][Rails.env])

    smtp_header = SmtpApiHeader.new
    smtp_header.add_to recipients

    @report_hashes = Array.wrap report_hash
    @report_config = report_config

    subject = "#{report_config[:email][:subject]} (#{l(Time.current.to_date, format: :medium).squish}#{" -- #{Rails.env} environment" unless Rails.env.production?})"

    mail subject: subject, to: recipients, 'X-SMTPAPI' => smtp_header.as_json
  end

  # Daily Booklr status summary, sent only to the global recipient list.
  def booklr_status_report(stat_hash)
    recipients = Array.wrap(AmazeBot.config[:reports][:recipients_all_reports][Rails.env])

    smtp_header = SmtpApiHeader.new
    smtp_header.add_to recipients

    @stat_hash = stat_hash

    subject = "Booklr Daily Status Report (#{l(Time.current.to_date, format: :medium).squish}#{" -- #{Rails.env} environment" unless Rails.env.production?})"

    mail subject: subject, to: recipients, 'X-SMTPAPI' => smtp_header.as_json
  end
end
-
1
class InternalReportsMailer < ActionMailer::Base
  default from: 'data@vook.com'

  helper :application

  # Sends an internal report notification to the report's configured
  # recipients plus the global all-reports recipient list.
  def basic_report(report_id, file_details, report_config)
    recipients = Array.wrap(report_config[:recipients][Rails.env]) + Array.wrap(AmazeBot.config[:reports][:recipients_all_reports][Rails.env])

    smtp_header = SmtpApiHeader.new
    smtp_header.add_to recipients

    @report_id = report_id
    @file_details = file_details
    @report_config = report_config

    subject = "#{report_config[:email][:subject]} (#{l(Time.current.to_date, format: :medium).squish}#{" -- #{Rails.env} environment" unless Rails.env.production?})"

    mail subject: subject, to: recipients, 'X-SMTPAPI' => smtp_header.as_json
  end
end
-
1
class NotificationMailer < ActionMailer::Base
  default from: 'data@vook.com', to: 'data@vook.com'

  # Alerts when the expected number of stats was not queued for scraping.
  def scraper_count_error(message)
    @message = message
    mail subject: "Wrong number of stats queued up", 'X-SMTPAPI' => header.as_json
  end

  # Alerts when a report is blocked by an unexpected row count.
  def report_row_count_error(report_identifier, message)
    @message = message
    mail subject: "[#{Utilities.env}] #{report_identifier} - Report Blocked", 'X-SMTPAPI' => header.as_json
  end

  private

  # BUG FIX: @header used to be assigned in the class body, which creates a
  # class-level instance variable that is nil inside mailer instances — the
  # X-SMTPAPI header was therefore always empty. Build it per message here.
  def header
    @header ||= SmtpApiHeader.new default_params[:to]
  end
end
-
1
class AmazonAPIResponse < ActiveRecord::Base
  # One Amazon Product Advertising API response recorded for a warehouse
  # book version; the individual product rows live in AmazonAPIResponseItem.

  # Attributes

  attr_accessible :warehouse_book_version, :warehouse_book_version_id

  # Associations

  belongs_to :warehouse_book_version, inverse_of: :amazon_api_response
  has_many :amazon_api_response_items, inverse_of: :amazon_api_response, dependent: :destroy
  # Subsets of the response items split by API query type, ordered by the
  # rank Amazon returned them in.
  has_many :amazon_api_lookup_response_items, lambda {where(query_type: 'lookup').order(:response_rank)}, class_name: 'AmazonAPIResponseItem'
  has_many :amazon_api_search_response_items, lambda {where(query_type: 'search').order(:response_rank)}, class_name: 'AmazonAPIResponseItem'

  # Validations

  validates_presence_of :warehouse_book_version_id

  # Miscellaneous

  # Picks the single response item that matches this book version.
  #
  # Returns the item directly when exactly one search (preferred) or lookup
  # item exists. With several items, tries to disambiguate by ASIN — first
  # from tracked-book-version metadata, then (for old-style top-100
  # ingestions) from the isbn13 converted to isbn10. Returns
  # :ambiguous_results when several items remain indistinguishable and
  # :no_results when the response contained no items.
  def matching_response_item
    if amazon_api_search_response_items.count == 1
      return_val = amazon_api_search_response_items.first
      # count == 1 with a nil first should be impossible; log for forensics.
      Rails.logger.tagged('inexplicable') {Rails.logger.info "DUMB (search) #{amazon_api_search_response_items.count} / #{return_val} / #{amazon_api_response_items.count}"} if return_val == nil
      return_val
    elsif amazon_api_lookup_response_items.count == 1
      return_val = amazon_api_lookup_response_items.first
      Rails.logger.tagged('inexplicable') {Rails.logger.info "DUMB (lookup) #{amazon_api_lookup_response_items.count} / #{return_val} / #{amazon_api_response_items.count}"} if return_val == nil
      return_val
    elsif amazon_api_response_items.count > 1
      # Disambiguate via an ASIN stored in tracked book version metadata;
      # metadata[0] is treated as an ASIN when it is exactly 10 chars long.
      TrackedBookVersion.where(warehouse_book_version_id: warehouse_book_version.id).each do |tracked_book_version|
        if tracked_book_version.metadata.present?
          metadata_asin = tracked_book_version.metadata[0] if tracked_book_version.metadata[0].present? && tracked_book_version.metadata[0].length == 10
          if metadata_asin.present?
            amazon_api_response_items.each do |response_item|
              if response_item.asin == metadata_asin
                return response_item
              end
            end
          end
        end
      end

      # For all top 100 ingestions (old style) with an isbn13 and an ambiguous result, select the one that matches
      # the asin to its isbn13 converted to isbn10
      if warehouse_book_version.source.present? && warehouse_book_version.source.include?('amazon-top100-') && warehouse_book_version.isbn13.present?
        amazon_api_response_items.each do |response_item|
          if response_item.asin == ISBN_Tools.isbn13_to_isbn10(warehouse_book_version.isbn13)
            return response_item
          end
        end
      end
      :ambiguous_results
    else
      :no_results
    end
  end
end
-
1
class AmazonAPIResponseItem < ActiveRecord::Base
  # A single product row from an Amazon API response (lookup or search).

  # Attributes

  # The two Amazon API query types an item can originate from.
  QUERY_TYPES = [:lookup, :search].freeze

  attr_accessible :amazon_api_response, :amazon_api_response_id, :asin, :author, :binding, :brand, :creator, :ean, :ean_list_element, :eisbn,
                  :isbn, :item_dimensions_height, :item_dimensions_height_unit, :item_dimensions_length, :item_dimensions_length_unit,
                  :item_dimensions_weight, :item_dimensions_weight_unit, :item_dimensions_width, :item_dimensions_width_unit,
                  :label, :large_image_url, :list_price_amount, :list_price_currency_code, :manufacturer, :medium_image_url,
                  :number_of_pages, :package_dimensions_height, :package_dimensions_height_unit, :package_dimensions_length,
                  :package_dimensions_length_unit, :package_dimensions_weight, :package_dimensions_weight_unit, :package_dimensions_width,
                  :package_dimensions_width_unit, :publication_date, :publisher, :sales_rank, :small_image_url, :studio,
                  :title, :response_rank, :query_type

  # Associations

  belongs_to :amazon_api_response, inverse_of: :amazon_api_response_items, touch: true

  # Validations

  validates_presence_of :amazon_api_response_id, :response_rank, :query_type
  validates_uniqueness_of :response_rank, scope: [:amazon_api_response_id, :query_type]
  validates_inclusion_of :query_type, in: QUERY_TYPES + QUERY_TYPES.collect(&:to_s)

  # Miscellaneous

  # Best-available 13-digit identifier: eisbn first, then ean, then
  # ean_list_element; nil when none is present.
  def isbn13
    return eisbn if eisbn.present?
    return ean if ean.present?
    ean_list_element if ean_list_element.present?
  end

  # Prefers the author field, falling back to creator.
  def author_name
    author.present? ? author : creator
  end

  # Human-readable package dimensions, or nil if any dimension is missing.
  # NOTE(review): values are divided by 100.0, so they appear to be stored
  # as hundredths of an inch/pound — confirm against the ingestion code.
  def physical_details
    if package_dimensions_length.present? && package_dimensions_width.present? && package_dimensions_height.present? && package_dimensions_weight.present?
      "#{package_dimensions_length / 100.0} x #{package_dimensions_width / 100.0} x #{package_dimensions_height / 100.0} inches. #{package_dimensions_weight / 100.0} pounds."
    end
  end

  # Title with embedded newlines flattened to spaces; nil-safe via try.
  def title
    read_attribute(:title).try(:gsub, /\n/, ' ')
  end

  # Accessor for the :binding column; named get_binding because #binding
  # would shadow Kernel#binding.
  def get_binding
    read_attribute(:binding)
  end
end
-
1
class AmazonAuthorPage < AmazonPage
  # Convenience constructor: fetches the author page for an author ASIN on
  # the given Amazon TLD.
  def self.by_asin_and_tld(author_asin, tld)
    new Urls.amazon_author_page(author_asin, tld)
  end

  # Names from the "customers also bought items by" table, or nil when the
  # table is absent or empty.
  def scrape_also_bought_items_by
    author_links = @page.css('#entitySimsTable td a')
    author_links.map(&:text).presence if author_links.present?
  end
end
-
1
class AmazonBestSellersPage < AmazonPage
  # Scraper for an Amazon best-sellers (top 100) category page.

  # Base category breadcrumbs we are willing to scrape.
  ACCEPTABLE_BASE_CATEGORIES = ['Books', 'Kindle Store > Kindle eBooks', 'Kindle Store > Kindle Singles', 'Kindle Store > Books'].freeze

  def self.by_category_id_and_tld_and_base_category_and_page_number(category_id, tld, base_category, page_number)
    new Urls.amazon_book_category_page(category_id, tld, base_category, page_number)
  end

  # True when the breadcrumb name is one of, or nested under, an acceptable
  # base category.
  def self.is_acceptable_amazon_category_name?(category_name)
    category_name.present? && ACCEPTABLE_BASE_CATEGORIES.any? {|acceptable_category| acceptable_category == category_name || category_name.starts_with?("#{acceptable_category} > ")}
  end

  # One hash of scraped fields per best-seller entry on the page.
  # NOTE(review): scrape_best_sellers (unsuffixed) is not defined here;
  # presumably the Page superclass dispatches to the _1/_2 variants below —
  # confirm in Page.
  def best_sellers_stats
    return [] if scrape_best_sellers.blank?

    scrape_best_sellers.collect do |element|
      {rank: get_rank(element),
       days_in_top_100: get_days_in_top_100(element),
       trend: get_trend(element),
       title: get_title(element),
       asin: get_asin(element),
       author: get_author_name(element),
       star_rating: get_star_rating(element),
       rating_count: get_rating_count(element),
       list_price: get_list_price(element),
       price: get_price(element)}
    end
  end

  # Full breadcrumb path, e.g. "Kindle Store > Kindle eBooks > Romance",
  # built from the browse tree with the selected leaf appended.
  def scrape_category_name
    leaf = @page.css('#zg_browseRoot .zg_selected').first
    if leaf.present?
      categories = @page.css('#zg_browseRoot ul > li:first-child *:first-child').to_a
      # Drop the last entry (replaced by the selected leaf below).
      categories = categories[0, categories.size - 1] || []
      categories << leaf
      categories.uniq.collect {|category| category.text.squish}.join(' > ')
    end
  end

  # [{category_name:, category_id:}, ...] for the selected category's
  # children in the browse tree, or nil when there are none.
  def scrape_subcategories
    if @page.css('#zg_browseRoot .zg_selected').present? && @page.css('#zg_browseRoot .zg_selected').first.parent.parent.css('ul').present?
      @page.css('#zg_browseRoot .zg_selected').first.parent.parent.css('li a').each_with_object([]) do |element, arr|
        subcategory_name = "#{scrape_category_name} > #{element.text.squish}"
        subcategory_id = element.attributes['href'].present? ? ScraperUtilities.get_category_id_from_url(element.attributes['href'].text.squish) : nil
        arr << {category_name: subcategory_name, category_id: subcategory_id}
      end
    end
  end

  # Number of pagination pages for this category's top 100.
  def scrape_number_of_pages
    @page.css('.zg_pagination .zg_page').count
  end

  # Best-seller entry nodes — newer page layout.
  def scrape_best_sellers_1
    @page.css('.zg_itemImmersion').presence
  end

  # Best-seller entry nodes — older page layout.
  def scrape_best_sellers_2
    @page.css('.zg_itemRow > :first-child').presence
  end

  # True when Amazon shows the "no best sellers" info message.
  def scrape_no_best_sellers?
    @page.css('.zg_infoMessage').present?
  end

  # Rank digits with the trailing dot stripped, e.g. "12." -> "12".
  def get_rank(element)
    element.css('.zg_rankNumber').first.text.squish.gsub('.', '') if element.css('.zg_rankNumber').first.present?
  end

  # Days the item has spent in the top 100, as a bare number string.
  def get_days_in_top_100(element)
    days_in_top_100_element = element.css('.zg_rankMeta').first || element.css('.zg_daysInList').first
    days_in_top_100_element.text.squish.gsub(/ days? in the top 100/, '') if days_in_top_100_element.present?
  end

  # 'Up'/'Down'/'Steady' based on the trend arrow sprite.
  def get_trend(element)
    if element.css('.zg_arrowUp').present?
      'Up'
    elsif element.css('.zg_arrowDown').present?
      'Down'
    else
      'Steady'
    end
  end

  # Title from the link text, falling back to the compact image's title/alt.
  def get_title(element)
    title_element1 = element.css('.zg_title a').first
    title_element2 = element.css('.zg_itemImage_compact img').first

    (title_element1.try(:text).try(:squish) if title_element1.present?) ||
      (title_element2.attributes['title'].try(:text).try(:squish) if title_element2.present?) ||
      (title_element2.attributes['alt'].try(:text).try(:squish) if title_element2.present?)
  end

  # ASIN extracted from whichever of several known link/attribute locations
  # is present, tried in order of reliability.
  def get_asin(element)
    title_element1 = element.css('.zg_title a').first
    asin_element1 = element.css('.zg_itemImage_compact a').first
    asin_element2 = element.css('.asinReviewsSummary').first
    asin_element3 = element.css('.crAvgStars > a').first
    asin_element4 = asin_element2.present? ? asin_element2.css('a').first : nil

    (ScraperUtilities.extract_asin_from_url(title_element1.attributes['href'].try(:text)) if title_element1.present?) ||
      (ScraperUtilities.extract_asin_from_url(asin_element1.attributes['href'].try(:text)) if asin_element1.present?) ||
      (asin_element2.attributes['name'].try(:text) if asin_element2.present?) ||
      (ScraperUtilities.extract_asin_from_url(asin_element3.attributes['href'].try(:text)) if asin_element3.present?) ||
      (ScraperUtilities.extract_asin_from_url(asin_element4.attributes['href'].try(:text)) if asin_element4.present?)
  end

  # Byline with the leading "by " removed.
  def get_author_name(element)
    element.css('.zg_byline').first.text.gsub(/^by /, '').squish if element.css('.zg_byline').present?
  end

  # Star rating number, e.g. "4.5" from "4.5 out of 5 stars"; skips the
  # Prime badge sprite.
  def get_star_rating(element)
    element.css('.swSprite:not(.s_primeBadge)').first.attributes['title'].text.squish.gsub(' out of 5 stars', '') if element.css('.swSprite:not(.s_primeBadge)').present? && element.css('.swSprite:not(.s_primeBadge)').first.attributes['title'].text.present?
  end

  # Review count with thousands separators removed.
  def get_rating_count(element)
    element.css('.crAvgStars > a').first.text.squish.gsub(',', '') if element.css('.crAvgStars > a').present?
  end

  # List price with '$' and '.' stripped (i.e. price in cents as a string).
  def get_list_price(element)
    element.css('.listprice').first.text.squish.gsub(/\$|\./, '') if element.css('.listprice').present? && element.css('.listprice').first.text.present?
  end

  # Sale price with '$' and '.' stripped (i.e. price in cents as a string).
  def get_price(element)
    element.css('.price').first.text.squish.gsub(/\$|\./, '') if element.css('.price').present? && element.css('.price').first.text.present?
  end
end
-
1
class AmazonCategoryCollection < Mongo::Collection
  # Wraps the 'amazon_categories' collection on the global Mongo connection.
  def initialize(opts = {})
    super 'amazon_categories', $mongodb, opts
  end

  # Resets the collection ahead of a scrape run, then seeds the root
  # 'Kindle Store' categories — they are never scraped directly but are
  # associated with warehouse_stats and warehouse_list_stats.
  def prepare_for_scraping
    drop

    %w[.com .co.uk].each do |tld|
      add_category_details nil, 'Kindle Store', tld, :canonical
    end
  end

  # Inserts one category document.
  def add_category_details(category_id, category_name, tld, status)
    insert category_id: category_id, category_name: category_name, tld: tld, status: status
  end
end
-
1
class AmazonKindleDailyDealsPage < AmazonPage
  # Scraper for the Kindle Daily Deals landing page.
  def initialize(user_agent = 'Windows Mozilla')
    super 'http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000677541', user_agent
  end

  # If there are multiple Daily Kindle Deals, combine them with the on page deals to get all on page deals
  # Otherwise, return just the on page deals which includes the single Daily Kindle Deal
  def deals
    multiple_kindle_daily_deal.present? ? on_page_daily_deals + multiple_kindle_daily_deal : on_page_daily_deals
  end

  # Returns the Daily Kindle Deals if more than one exists
  def multiple_kindle_daily_deal
    page_banner = @page.css('.amabot_center > .pageBanner').first
    first_deals_element = page_banner.next_element
    # A carousel widget (element with attributes) right after the banner
    # indicates multiple daily deals.
    if first_deals_element.attributes.present?
      scrape_details_from_widget(first_deals_element).each_with_index.collect {|details, index| details.merge daily_deal_type: 'The Kindle Daily Deal', rank: index + 1}
    end
  end

  # Returns all daily deals that have non-search page urls, means there is only one book for that daily deal category
  # Will return the Daily Kindle Deal if it only has one book, will not if it has more than one book
  def on_page_daily_deals
    all_daily_deals.reject {|details| URI.parse(URI.encode(details[:url])).path.start_with? '/s/'}
  end

  # Returns daily deals that have a search page url, means the daily deal category has multiple books
  def search_page_daily_deals
    all_daily_deals.select {|details| URI.parse(URI.encode(details[:url])).path.start_with? '/s/'}
  end

  # Does not include multiple book Daily Kindle Deals, despite name
  #
  # Builds deal hashes by walking two parallel table-row sets: odd rows hold
  # the deal-type headings, even rows hold the deal details; the two passes
  # share the same positional index into `deals`.
  def all_daily_deals
    deals = []

    @page.css('.amabot_center > div > table > tr:nth-child(odd) > td:nth-child(odd) > b').each_with_index do |element, index|
      deals[index] ||= {}
      deals[index][:daily_deal_type] = element.text.squish
    end
    @page.css('.amabot_center > div > table > tr:nth-child(even) > td:not(:empty)').each_with_index do |element, index|
      deals[index] ||= {}
      if element.css('p a').present? && element.css('p a').first.attributes['href'].present?
        deals[index][:asin] = ScraperUtilities.extract_asin_from_url(element.css('p a').first.attributes['href'].value)
        deals[index][:url] = force_absolute_url(element.css('p a').first.attributes['href'].value)
      end
      deals[index][:author_name] = element.css('p').first.text.squish.gsub(/^by /, '') if element.css('p').present?
      # Two cell layouts: with a bold/italic title element vs. without.
      if element.css('b i').present?
        deals[index][:title] = element.css('b i').first.text.squish
        deals[index][:description] = element.css('p')[2].text.squish if element.css('p')[2].present?
        deals[index][:price] = ScraperUtilities.cleanse_price(element.css('.price').text) if element.css('.price').present?
      else
        deals[index][:description] = element.children[1].text.squish if element.children[1].present?
        deals[index][:price] = ScraperUtilities.cleanse_price(element.css('p')[1].text) if element.css('p')[1].present?
      end
    end

    deals
  end
end
-
1
class AmazonKindleMonthlyDealsPage < AmazonPage
  # Scraper for the Kindle Monthly Deals landing page.
  def initialize(user_agent = 'Windows Mozilla')
    super 'http://www.amazon.com/b/ref=amb_link_380698542_1?ie=UTF8&node=3441883011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=center-1&pf_rd_r=0B9969RWA42TMTQG751R&pf_rd_t=1401&pf_rd_p=1590374862&pf_rd_i=1000706171', user_agent
  end

  # One deal hash per carousel entry, tagged with the carousel heading and
  # a 1-based rank; widgets with no scrapable entries are dropped.
  def deals
    @page.css('.amabot_center .widget').map do |widget|
      heading = widget.css('h2').text.squish
      widget_details = scrape_details_from_widget(widget)
      if widget_details.present?
        widget_details.each_with_index.map do |details, position|
          details.merge daily_deal_type: "Carousel - #{heading}", rank: position + 1
        end
      end
    end.flatten.compact
  end

  # Absolute URLs of the left-nav category links.
  def category_urls
    @page.css('#leftNav .left_nav ul li a').map do |anchor|
      force_absolute_url anchor['href']
    end
  end
end
-
1
class AmazonKindleSelectPage < AmazonPage
  # Scraper for the Kindle Select 25 feature page.
  def initialize(user_agent = 'Windows Mozilla')
    super 'http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1001298091', user_agent
  end

  # All Kindle Select 25 titles: first from the on-page table (odd rows hold
  # rank/title/author, even rows hold link/price, joined by positional
  # index), then the remainder from the s9 carousel widget, ranked after the
  # table entries. Every hash is tagged daily_deal_type: 'Kindle Select 25'.
  def all_select_titles
    titles = []
    @page.css('.amabot_center > div > table:nth-of-type(1) > tr:nth-child(odd) > td:nth-child(odd) > b').each_with_index do |element, index|
      titles[index] ||= {}
      titles[index][:rank] = element.children[0].text.gsub(/\D/, '').to_i if element.children[0].present?
      titles[index][:title] = element.children[1].text.squish if element.children[1].present?
      titles[index][:author_name] = element.children[2].text.squish.gsub(/^by /, '') if element.children[2].present?
    end
    @page.css('.amabot_center > div > table:nth-of-type(1) > tr:nth-child(even) > td:nth-child(odd)').each_with_index do |element, index|
      titles[index] ||= {}
      if element.css('a').present? && element.css('a').first.attributes['href'].present?
        titles[index][:asin] = ScraperUtilities.extract_asin_from_url(element.css('a').first.attributes['href'].value)
        titles[index][:url] = force_absolute_url(element.css('a').first.attributes['href'].value)
      end
      titles[index][:price] = ScraperUtilities.cleanse_price(element.css('.price').text) if element.css('.price').present?
    end

    # Carousel entries continue the ranking after the table entries.
    rank = titles.count

    titles += scrape_details_from_widget(@page.css('.amabot_center .s9Widget').first).each_with_index.collect do |details, index|
      details.merge rank: rank + index + 1
    end

    titles.each {|title| title.merge! daily_deal_type: 'Kindle Select 25'}
  end
end
-
1
class AmazonPage < Page
  # Fetches the page via the Page superclass and remembers the ASIN parsed
  # out of the URL.
  def initialize(url, user_agent = 'Windows Mozilla')
    super url, user_agent
    @asin = ScraperUtilities.extract_asin_from_url url
  end

  # True when Amazon served its bot-detection CAPTCHA instead of content.
  def captcha?
    @page.css('input#captchacharacters').present?
  end

  # Extracts {asin:, url:, title:, author_name:, price:} hashes from an s9
  # carousel widget's entries.
  def scrape_details_from_widget(widget)
    widget.css('div.s9hl').map do |element|
      details = {}

      title_link = element.css('.title').first
      if title_link.present?
        href = title_link.attributes['href'].value
        details[:asin] = ScraperUtilities.extract_asin_from_url(href)
        details[:url] = force_absolute_url(href)
        details[:title] = title_link.attributes['title'].value.squish
      end

      byline = element.css('div.t11').first
      details[:author_name] = byline.text.gsub('›', '').squish if byline.present?

      price_nodes = element.css('.s9Price')
      details[:price] = ScraperUtilities.cleanse_price(price_nodes.text) if price_nodes.present?

      # Fallback title location used by some widget variants.
      fallback_title = element.css('.s9TitleText').first
      details[:title] = fallback_title.text.squish if details[:title].blank? && fallback_title.present?

      details
    end
  end
end
-
1
class AmazonProductPage < AmazonPage
-
1
EMPTY_RATING_HISTOGRAM = {five_star_count: '0', four_star_count: '0', three_star_count: '0', two_star_count: '0', one_star_count: '0'}.with_indifferent_access.freeze
-
1
def self.by_asin_and_tld(asin, tld)
-
100
new Urls.amazon_book_page(asin, tld)
-
end
-
-
1
def book_image_exists?
-
# '#prodImageCell a img' is the old scrape, doesnt seem to exist anymore but here for completeness
-
# '#main-image' is the newer scrape and should cover all other cases
-
# '#imgBlkFront' indicates the new page format and if its the correct style it means the no image image was found
-
3
@page.css('#prodImageCell a img').present? || @page.css('#main-image').present? ||
-
1
(@page.css('#imgBlkFront').present? && @page.css('#imgBlkFront').first['style'] != 'max-width:60px; max-height:40px;')
-
end
-
-
1
def buy_button_exists?
-
# 'input#buyButton' is for 1-click ebook and pre order buy buttons, 'input#buyButton' for non 1 click e book buy buttons
-
# 'span#addToCartSpan' for all physical buy and pre order buttons
-
# "//input[@name='submit.add-to-cart']" indicates the new page type buy button (both normal and pre-order)
-
2
@page.css('input#buyButton').present? || @page.css('button#buyButton').present? ||
-
@page.css('span#addToCartSpan').present? || @page.xpath("//input[@name='submit.add-to-cart']").present?
-
end
-
-
1
def physical_details
-
3
scrape_weight.blank? ? scrape_dimensions : "#{scrape_dimensions}. #{scrape_weight}."
-
end
-
-
1
def scrape_star_rating_distribution_1
-
3
EMPTY_RATING_HISTOGRAM if @page.css('#emptyHistogram').present?
-
end
-
-
1
def scrape_star_rating_distribution_2
-
3
distribution = %w[five four three two one].each_with_object(HashWithIndifferentAccess.new) do |num, hash|
-
15
count_text = @page.css("div.histoRow#{num} div.histoCount").text
-
15
hash["#{num}_star_count"] = count_text.present? ? count_text.gsub(',', '') : '0'
-
end
-
-
3
distribution unless distribution == EMPTY_RATING_HISTOGRAM
-
end
-
-
1
def scrape_star_rating_distribution_3
-
3
@page.css('#histogramTable .a-histogram-row').each_with_object(HashWithIndifferentAccess.new) do |row, hash|
-
15
number = row.css('td:first').text.scan(/\d+/).join('')
-
15
number_as_word = case number
-
when '1'
-
3
'one'
-
when '2'
-
3
'two'
-
when '3'
-
3
'three'
-
when '4'
-
3
'four'
-
when '5'
-
3
'five'
-
end
-
15
hash["#{number_as_word}_star_count"] = row.css('td:last').text.scan(/\d+/).join('')
-
end
-
end
-
-
1
def scrape_likes
-
2
like_count_element = @page.css('span.amazonLikeCount').first
-
2
like_count_element.text.gsub(',', '').strip if like_count_element.present?
-
end
-
-
1
def scrape_sales_rank_1
-
7
sales_rank_element = @page.xpath(".//li[@id='SalesRank']/b").first
-
7
sales_rank_element.next_sibling.text.scan(/\d/).join if sales_rank_element.present? && sales_rank_element.next_sibling.present?
-
end
-
-
1
def scrape_sales_rank_2
-
@page.css('#amazon-sales-rank-detail .a-span9 span span:first').text.split(' in ').first.scan(/\d/).join if @page.css('#amazon-sales-rank-detail .a-span9 span span:first').present?
-
end
-
-
1
def sales_rank_category_element
-
11
@page.xpath(".//li[@id='SalesRank']/b").first.next_sibling if @page.xpath(".//li[@id='SalesRank']/b").first.present?
-
end
-
-
1
def scrape_sales_rank_category_1
-
2
sales_rank_category_element.text.split('in', 2)[1].gsub('(','').squish if sales_rank_category_element.present? && sales_rank_category_element.text.include?('Kindle')
-
end
-
-
1
def scrape_sales_rank_category_2
-
2
sales_rank_category_element.text.split('in')[1].split(' ').first.gsub(',', '') if sales_rank_category_element.present? && sales_rank_category_element.text.exclude?('Kindle')
-
end
-
-
1
def scrape_sales_rank_category_3
-
@page.css('#amazon-sales-rank-detail .a-span9 span span:first').text.scan(/[a-zA-Z]| /).join('').squish.gsub(/^in /, '') if @page.css('#amazon-sales-rank-detail .a-span9 span span:first').present?
-
end
-
-
1
def scrape_sub_categories_and_ranks_1
-
6
@page.css('#SalesRank .zg_hrsr_item').each_with_object([]) do |element, arr|
-
13
rank_element = element.css('.zg_hrsr_rank').first
-
13
category_element = element.css('.zg_hrsr_ladder').first
-
13
category_id_element = element.css('.zg_hrsr_ladder b a').first
-
-
# gsub '#' out for US/UK and 'Nr. ' out for DE
-
arr << {
-
13
rank: (rank_element.text.gsub('#', '').gsub('Nr. ','').squish if rank_element.present?),
-
13
category: (category_element.text.gsub(/\u00A0/, ' ').gsub(/^in /, '').squish if category_element.present?),
-
13
category_id: (ScraperUtilities.get_category_id_from_url(category_id_element.attributes['href'].text.squish) if category_id_element.present?)
-
13
}.with_indifferent_access
-
end
-
end
-
-
1
def scrape_sub_categories_and_ranks_2
-
spans = @page.css('#amazon-sales-rank-detail .a-span9 span span')
-
spans.shift
-
spans.each_with_object([]) do |element, arr|
-
arr << {
-
rank: (element.text.split(' in ').first.scan(/\d/).join),
-
category: (element.text.split(' in ').last),
-
category_id: (ScraperUtilities.get_category_id_from_url(element.css('a').first.attr('href')) if element.css('a').first.present?)
-
}.with_indifferent_access
-
end
-
end
-
-
1
def scrape_book_format_1
-
32
@page.css('#btAsinTitle').text.scan(/\[(.+?)\]/).last.first if @page.css('#btAsinTitle').present? && @page.css('#btAsinTitle').text.scan(/\[(.+?)\]/).present?
-
end
-
-
1
def scrape_book_format_2
-
1
@page.css('h1#title span').first.inner_text.strip if @page.css('h1#title span').present?
-
end
-
-
1
def scrape_book_format_3
-
@page.css('#formats span.a-declarative .a-span4 > span').text.strip if @page.css('#formats span.a-declarative .a-span4 > span').present?
-
end
-
-
1
def scrape_also_boughts
-
@page.css('#purchaseButtonWrapper .shoveler-content li').each_with_object([]) do |element, array|
-
12
title_link = element.css('a.sim-img-title').first
-
12
if title_link.present?
-
12
title_element = element.css('a.sim-img-title span').first
-
12
author_element = element.css('.byline, .shvl-byline').first
-
12
star_count_element = element.css('.auiTestSprite').first
-
12
rating_count_element = element.css('.crAvgStars > a').first
-
12
price_element = element.css('.price').first
-
12
array << {title: ScraperUtilities.cleanse_string((title_element.present? && title_element.attributes['title'].present? ? title_element.attributes['title'].text.strip : title_link.text.strip)),
-
isbn_or_asin: ScraperUtilities.extract_asin_from_url(title_link.attributes['href'].text),
-
447
author: (author_element.text.chars.select{|i| i.valid_encoding?}.join.gsub('by ', '').gsub(/[^0-9a-z ]/i, '').strip if author_element.present?),
-
12
star_count: (star_count_element.attributes['title'].text.strip if star_count_element.present? && star_count_element.attributes['title'].present?),
-
12
rating_count: (rating_count_element.text.strip.gsub(',', '') if rating_count_element.present?),
-
24
price: (ScraperUtilities.cleanse_price(price_element.text.strip) if price_element.present?)}.with_indifferent_access
-
end
-
2
end.presence
-
end
-
-
1
# Scrapes the "What Other Items Do Customers Buy After Viewing This Item?" box.
# Returns an array of indifferent-access hashes (title, isbn_or_asin, author,
# star_count, rating_count, price), or nil when the box is absent or empty (.presence).
def scrape_bought_after_viewing
  bought_after_viewing = []
  # Locate the box by its exact heading text, then walk to the sibling detail rows.
  bought_after_viewing_element = @page.xpath("//*[.='What Other Items Do Customers Buy After Viewing This Item?']").first
  if bought_after_viewing_element.present?
    bought_after_viewing = bought_after_viewing_element.parent.css('.asinDetails').each_with_object([]) do |element, array|
      title_element = element.css('.cpAsinTitle').first
      # Rows without a title element are skipped.
      if title_element.present?
        isbn_or_asin_element = element.css('a').first
        author_element = element.css('.vtp-byline-text').first
        star_count_element = element.css('.swSprite').first
        rating_count_element = element.css('.crAvgStars > a').first
        price_element = element.css('.price').first
        array << {title: ScraperUtilities.cleanse_string(title_element.text.strip),
                  isbn_or_asin: (ScraperUtilities.extract_asin_from_url(isbn_or_asin_element.attributes['href'].text) if isbn_or_asin_element.present?),
                  author: (author_element.text.strip.gsub('by ', '') if author_element.present?),
                  star_count: (star_count_element.attributes['title'].text.strip if star_count_element.present?),
                  rating_count: (rating_count_element.text.strip.gsub(',', '') if rating_count_element.present?),
                  price: (ScraperUtilities.cleanse_price(price_element.text.strip) if price_element.present?)}.with_indifferent_access
      end
    end
  end

  bought_after_viewing.presence
end
-
-
1
# Frequently-bought-together variant 1: the #fbt_item_details list layout.
# Returns an array of indifferent-access hashes (title, author, price, type),
# or nil when no titled entries exist (.presence).
def scrape_frequently_bought_together_1
  @page.css('#fbt_item_details li').each_with_object([]) do |element, array|
    title_element = element.css('label > span, span > a').first
    if title_element.present?
      author_element = element.css('.bxgy-byline-text').first
      price_element = element.css('.bxgy-item-price').first
      type_element = element.css('.bxgy-binding-byline').first
      type_text_element = element.css('.bxgy-byline-text').first
      array << {title: ScraperUtilities.cleanse_string(title_element.text.squish),
                author: (author_element.text.squish.gsub('by ', '') if author_element.present?),
                price: (ScraperUtilities.cleanse_price(price_element.text.squish) if price_element.present?),
                # The binding byline contains both format and byline text; subtract the byline to isolate the format.
                type: (type_element.text.gsub(type_text_element.text, '').squish if type_element.present? && type_text_element.present?)}.with_indifferent_access
    end
  end.presence
end
-
-
1
# Frequently-bought-together variant 2: the single-item #AutoBuyXGetY form layout.
# Returns a one-element array of details, or nil when the form or title is missing.
def scrape_frequently_bought_together_2
  form = @page.css('#AutoBuyXGetY form').first
  title_node = @page.css('.bxgy-text a').first
  return unless form.present? && title_node.present?

  author_node = form.css('.bxgy-byline-text').first
  price_node = form.css('.bxgy-item-price').first
  binding_node = form.css('.bxgy-binding-byline').first
  binding_text_node = form.css('.bxgy-byline-text').first

  details = {title: ScraperUtilities.cleanse_string(title_node.text.squish),
             author: (author_node.text.squish.gsub('by ', '') if author_node.present?),
             price: (ScraperUtilities.cleanse_price(price_node.text.squish) if price_node.present?),
             type: (binding_node.text.gsub(binding_text_node.text, '').squish if binding_node.present? && binding_text_node.present?)}
  [details.with_indifferent_access]
end
-
-
1
# Collects the category breadcrumb strings from the "Look for Similar Items by Category" box.
def scrape_similar_items_by_category
  heading = @page.xpath("//*[.='Look for Similar Items by Category']").first
  return unless heading.present?

  items = heading.parent.css('li')
  items.collect(&:text).compact.presence if items.present?
end
-
-
1
# Collects the numeric category node ids from the "Look for Similar Items by Category" links.
def scrape_similar_items_by_category_id
  heading = @page.xpath("//*[.='Look for Similar Items by Category']").first
  return unless heading.present?

  links = heading.parent.css('li a:last-of-type')
  return unless links.present?

  node_ids = links.collect do |link|
    CGI::parse(URI::parse(link['href']).query)['node'] if link['href'].present?
  end
  node_ids.flatten.compact.presence
end
-
-
1
# Related-format variant 1: per-format winner tbody rows with a currency-bearing price cell.
# Returns {coerced_format => {price:, asin:}} or nil when nothing matched (.presence).
def scrape_related_format_data_1
  contains_currency_condition = ScraperUtilities::CURRENCY_CONDITION
  %w[paperback hardcover mass_market_paperback kindle].each_with_object({}) do |format, hash|
    @page.xpath(".//tbody[@id='#{format}_meta_binding_winner']").xpath("(.//td[@class=' price ' and #{contains_currency_condition}]|.//span[@class='price' and #{contains_currency_condition}])").each do |element|
      tr = element.xpath(".//ancestor::tr[@class='bucketBorderTop'][1]").first
      if tr.present?
        format_title = tr.css('td.tmm_bookTitle').first.text.squish
        #.co.uk DOM doesn't specify a specific mass_market_paperback tbody so we find it in paperback and set the format accordingly
        format_title = ScraperUtilities.coerce_amazon_format(format_title)
        hash[format_title] = {price: ScraperUtilities.cleanse_price(element.text.strip), asin: tr['id'].gsub('tmm_','')}
        # Only the first matching row per format is used.
        break
      end
    end
  end.presence
end
-
-
#TODO Test
-
1
# Related-format variant 2: the #formats/#twister declarative-span layout.
# Keeps only Kindle/Hardcover/Paperback rows whose price cell contains $ or £.
# Returns {coerced_format => {price:, asin:}} or nil when nothing matched (.presence).
def scrape_related_format_data_2
  @page.css('#formats #twister span.a-declarative').each_with_object({}) do |element, hash|
    if element.css('th a').present? && element.css('td')[1].present? && (element.css('td')[1].text.strip.include?('$') || element.css('td')[1].text.strip.include?('£'))
      format_title = element.css('th').text.strip

      acceptable_formats = %w[Kindle Hardcover Paperback]
      if acceptable_formats.any? {|format| format_title.include?(format)}
        format_title = ScraperUtilities.coerce_amazon_format(format_title)

        hash[format_title] = {price: ScraperUtilities.cleanse_price(element.css('td')[1].text.strip),
                              asin: (ScraperUtilities.extract_asin_from_url(element.css('th a').first['href']) if element.css('th a').present?)}
      end
    end
  end.presence
end
-
-
#TODO Test
-
1
# Related-format variant 3: the #formats div.top-level layout.
# Keeps only Kindle/Hardcover/Paperback rows whose price cell contains $ or £.
# Returns {coerced_format => {price:, asin:}} or nil when nothing matched (.presence).
def scrape_related_format_data_3
  @page.css('#formats div.top-level').each_with_object({}) do |element, hash|
    # NOTE(review): guards mix 'td a' (for the link) and 'td' (for the price cell) — presumably intentional for this DOM.
    if element.css('td a').present? && element.css('td a')[1].present? && (element.css('td')[1].text.strip.include?('$') || element.css('td')[1].text.strip.include?('£'))
      format_title = element.css('td a')[1].text.squish
      acceptable_formats = %w[Kindle Hardcover Paperback]
      if acceptable_formats.any? {|format| format_title.include?(format)}
        format_title = ScraperUtilities.coerce_amazon_format(format_title)

        hash[format_title] = {price: ScraperUtilities.cleanse_price(element.css('td')[1].text.strip),
                              asin: (ScraperUtilities.extract_asin_from_url(element.css('td a')[1]['href']) if element.css('td a')[1].present?)}
      end
    end
  end.presence
end
-
-
1
# Average-rating variant 1: the .acrRating text, truncated to "X.Y" (first 3 chars).
def scrape_amazon_average_rating_1
  rating_nodes = @page.css('.reviews .acrRating')
  rating_nodes.text[0, 3] if rating_nodes.present?
end
-
-
1
# Average-rating variant 2: the reviews summary inside the jumpBar div.
def scrape_amazon_average_rating_2
  summary = @page.xpath(".//div[@class='jumpBar']").css('span.asinReviewsSummary').first
  summary.search('.//span').first['title'][0, 3] if summary.present?
end
-
-
1
# Average-rating variant 3: the reviews summary inside the buying div.
def scrape_amazon_average_rating_3
  summary = @page.xpath(".//div[@class='buying']").css('span.asinReviewsSummary').first
  summary.search('.//span').first['title'][0, 3] if summary.present?
end
-
-
1
# Average-rating variant 4: the #avgRating span inside the review container.
def scrape_amazon_average_rating_4
  rating_nodes = @page.css('#reviewContainer #avgRating span')
  rating_nodes.text.squish[0, 3] if rating_nodes.present?
end
-
-
1
# Average-rating variant 5: the linked-histogram element's title attribute.
def scrape_amazon_average_rating_5
  histogram = @page.css('.reviewCountTextLinkedHistogram')
  histogram.first['title'][0, 3] if histogram.present?
end
-
-
1
# Review-count variant 1: third link inside the jumpBar stars summary, commas removed.
def scrape_amazon_review_count_1
  summary = @page.xpath(".//div[@class='jumpBar']").css('span.crAvgStars')
  return if summary.blank?

  count_link = summary.search('.//a')[2]
  count_link.text.split(' ')[0].gsub(',','') unless count_link.nil?
end
-
-
1
# Review-count variant 2: third link inside the buying-div stars summary, commas removed.
def scrape_amazon_review_count_2
  summary = @page.xpath(".//div[@class='buying']").css('span.crAvgStars')
  return if summary.blank?

  count_link = summary.search('.//a')[2]
  count_link.text.split(' ')[0].gsub(',','') unless count_link.nil?
end
-
-
1
# Review-count variant 3: the #summaryStars text with parentheses and commas stripped.
def scrape_amazon_review_count_3
  stars = @page.css('#reviewContainer #summaryStars')
  stars.text.squish.gsub(/\(|\)|,/, '') if stars.present?
end
-
-
1
# Review-count variant 4: digits-only count from #averageCustomerReviews, defaulting to '0'.
def scrape_amazon_review_count_4
  reviews = @page.css('#averageCustomerReviews')
  reviews.text.squish.gsub(/\D/, '').presence || '0' if reviews.present?
end
-
-
1
# NodeSet of priceLarge <b>/<span> elements inside table.product — including the
# variant where the table's class attribute carries a trailing space ('product ').
def large_price_element
  query = [".//table[@class='product']//b[@class='priceLarge']",
           ".//table[@class='product']//span[@class='priceLarge']",
           ".//table[@class='product ']//b[@class='priceLarge']",
           ".//table[@class='product ']//span[@class='priceLarge']"].join(' | ')
  @page.xpath(query)
end
-
-
1
# Price variant 1: only used when the label preceding the last large price mentions Prime.
def scrape_amazon_price_1
  elements = large_price_element
  return unless elements.present?

  label = elements.last.parent.previous_element
  ScraperUtilities.cleanse_price(elements.first.text.strip).presence if label.present? && label.text.strip.include?('Prime')
end
-
-
1
# Price variant 2: rental pages list the rent price first — take the last large price instead.
def scrape_amazon_price_2
  elements = large_price_element
  if elements.count > 1 && elements.first.parent.parent.children.css('.productBlockLabel').first.text.include?('Rent')
    ScraperUtilities.cleanse_price(elements.last.text.strip).presence
  end
end
-
-
1
# Price variant 3: plain large-price text.
def scrape_amazon_price_3
  elements = large_price_element
  ScraperUtilities.cleanse_price(elements.text.strip).presence if elements.present?
end
-
-
1
# Price variant 4: the buy-new section's offer price.
def scrape_amazon_price_4
  offer = @page.css('#buyNewSection span.offer-price')
  ScraperUtilities.cleanse_price(offer.text.strip) if offer.present?
end
-
-
1
# Price variant 5: the rental price grid.
def scrape_amazon_price_5
  rent = @page.css('#rentalPriceBlockGrid .rentPrice')
  ScraperUtilities.cleanse_price(rent.first.text.strip) if rent.present?
end
-
-
1
# Digital list price variant 1: find the "Digital List Price" block label, read its sibling price.
def scrape_digital_list_price_1
  label = @page.css('table.product .productBlockLabel').detect {|node| node.text.strip.gsub(':', '').downcase == 'digital list price'}
  ScraperUtilities.cleanse_price(label.parent.css('.listprice, .listPrice').text.strip) if label.present?
end
-
-
1
# Digital list price variant 2: dedicated .digitalListPrice container.
def scrape_digital_list_price_2
  price_nodes = @page.css('.digitalListPrice .listprice')
  ScraperUtilities.cleanse_price(price_nodes.text.strip) if price_nodes.present?
end
-
-
1
# List price variant 1: find the "Print List Price" block label, read its sibling price.
def scrape_amazon_list_price_1
  label = @page.css('table.product .productBlockLabel').detect {|node| node.text.strip.gsub(':', '').downcase == 'print list price'}
  ScraperUtilities.cleanse_price(label.parent.css('.listprice, .listPrice').text.strip) if label.present?
end
-
-
1
# List price variant 2: span.listprice, unless it belongs to the digital list price block.
def scrape_amazon_list_price_2
  price_nodes = @page.css('table.product span.listprice')
  return unless price_nodes.present?

  ScraperUtilities.cleanse_price(price_nodes.text.strip) unless price_nodes.first.parent['class'] == 'digitalListPrice'
end
-
-
1
# List price variant 3: td.listPrice cell.
def scrape_amazon_list_price_3
  cells = @page.css('table.product td.listPrice')
  ScraperUtilities.cleanse_price(cells.text.strip) if cells.present?
end
-
-
1
# List price variant 4: struck-through price inside the buy box.
def scrape_amazon_list_price_4
  struck = @page.css('#buyBoxInner span.a-text-strike')
  ScraperUtilities.cleanse_price(struck.text.strip) if struck.present?
end
-
-
1
# List price variant 5: td.listprice cell (lowercase class variant).
def scrape_amazon_list_price_5
  cells = @page.css('table.product td.listprice')
  ScraperUtilities.cleanse_price(cells.text.strip) if cells.present?
end
-
-
1
# Scrapes the Kindle author rank box: an overall rank plus per-category sub-ranks.
# Returns an indifferent-access hash ('overall_rank', 'sub_category_N' => {category_id, rank, category_name}),
# nil when nothing was found (.presence), or nil early when the node count is odd
# (the duplicate-halving assumption below would not hold).
def scrape_author_ranks
  author_ranks = {}.with_indifferent_access

  author_ranks['overall_rank'] = @page.css('.kindleAuthorRank .overallRank').first.text[/\d+/] if @page.css('.kindleAuthorRank .overallRank').present?

  if @page.css('.kindleAuthorRank .nodeRank').present?
    # For some reason only the first half of these nodes are real, the rest are just duplicates
    total = @page.css('.kindleAuthorRank .nodeRank').count
    return nil if total.odd?

    @page.css('.kindleAuthorRank .nodeRank').first(total/2).each_with_index do |node, index|
      # category_name: replace non-breaking spaces, collapse whitespace, drop the "#N in " prefix.
      author_ranks["sub_category_#{index + 1}"] = {category_id: (ScraperUtilities.get_category_id_from_url(node.css('a').last['href']) if node.css('a').present?),
                                                   rank: node.text[/\d+/],
                                                   category_name: node.text.gsub(/\u00A0/, ' ').squish.gsub(/#\d+ in /, '')}.with_indifferent_access
    end
  end

  author_ranks.presence
end
-
-
1
# Publication-date variant 1: the hidden input#pubdate value.
def scrape_pub_date_1
  pubdate_input = @page.css('input#pubdate')
  ScraperUtilities.parse_date_string(pubdate_input.first['value']) if pubdate_input.present?
end
-
-
1
# Publication-date variant 2: parenthesized date next to the Publisher/Verlag label.
# Tries every "(...)" group and returns the first that parses as a date; .de pages
# go through the monkey-patched Date.parse_international (day-first German dates).
def scrape_pub_date_2
  # different options because of ze damn Germanz and use the monkey patched Date.parse_international to handle German site
  xpath_condition = ".//b[contains(text(), 'Publisher')] | .//b[contains(text(), 'Verlag')]"

  if @page.css('table td.bucket').xpath(xpath_condition).present?
    # Periods are stripped first so abbreviations don't break the paren scan.
    @page.css('table td.bucket').xpath(xpath_condition).first.parent.text.gsub('.','').scan(/\((.*?)\)/).flatten.collect do |potential_date|
      begin
        if @tld == '.de'
          Date.parse_international potential_date
        else
          Date.parse potential_date
        end
      rescue ArgumentError
        # Non-date parenthesized text is simply skipped.
        nil
      end
    end.compact.first
  end
end
-
-
1
# Publication-date variant 3: parenthesized date in the #nonHeroSection publisher row.
def scrape_pub_date_3
  section = @page.css('#nonHeroSection')
  return unless section.present?

  publisher_span = section.xpath(".//span[contains(text(), 'Publisher')]").first
  return unless publisher_span.present?

  cell_text = publisher_span.parent.parent.css('td')[1].text
  parenthesized = cell_text.scan(/\((.*)\)/).flatten
  ScraperUtilities.parse_date_string(parenthesized.first) if parenthesized.present?
end
-
-
1
# Seller variant 1: bold name inside a "Ships from and sold by" line.
# Keeps the LAST match, mirroring the original full-iteration behavior.
def scrape_sold_by_1
  @page.css('.buying > b').reduce(nil) do |seller, bold|
    bold.parent.text.include?('Ships from and sold by') ? bold.text.strip : seller
  end
end
-
-
1
# Seller variant 2: value cell next to a "Sold by" block label.
# Keeps the LAST match, mirroring the original full-iteration behavior.
def scrape_sold_by_2
  @page.css('.productBlockLabel').reduce(nil) do |seller, label|
    label.text.strip.include?('Sold by') ? label.parent.css('td').last.text.strip : seller
  end
end
-
-
1
# Seller variant 3: "Sold by:" list item inside a bucket's content list.
# Keeps the last matching <li>; returns nil when none match.
def scrape_sold_by_3
  sold_by = nil

  @page.css('td.bucket div.content li').each do |li|
    if li.text.include?('Sold by')
      sold_by = li.text.gsub('Sold by:', '').strip
      # 194 (0xC2) is the lead byte of a UTF-8 non-breaking space; drop the stray leading char if present.
      sold_by[0] = '' if sold_by.bytes.first == 194
    end
  end

  sold_by
end
-
-
1
# Seller variant 4: first line of the #merchant-info block, prefix removed.
def scrape_sold_by_4
  merchant_info = @page.css('#merchant-info')
  return unless merchant_info.present? && merchant_info.text.strip.present?

  merchant_info.text.strip.split("\n").first.strip.gsub('Ships from and sold by ', '')
end
-
-
1
# Publisher name from the details bucket; matches English and German ("Verlag") labels.
def scrape_publisher
  label = @page.css('table td.bucket').xpath(".//b[contains(text(), 'Publisher')] | .//b[contains(text(), 'Verlag')]").first
  return unless label.present?

  # Name sits between ": " and the " (" that opens the date.
  names = label.parent.text.scan(/:\s(.*)\s\(/).flatten
  names.first if names.present?
end
-
-
1
# Product dimensions from the details bucket; matches English and German labels.
# On .com, commas are stripped from the value.
def scrape_dimensions
  xpath_condition = ".//b[contains(text(), 'Product Dimensions')] | .//b[contains(text(), 'Größe und/oder Gewicht')]"
  label = @page.css('table td.bucket').xpath(xpath_condition).first
  return unless label.present?

  value = label.parent.text.split(':').last
  @tld == '.com' ? value.gsub(',', '').strip : value.strip
end
-
-
1
# Shipping weight from the details bucket, with the trailing shipping-rates link text removed.
# On .com, commas are stripped (before the link-text removal, as in the original order).
def scrape_weight
  label = @page.css('table td.bucket').xpath(".//b[contains(text(), 'Shipping Weight')]").first
  return unless label.present?

  value = label.parent.text.split(':').last
  value = value.gsub(',', '') if @tld == '.com'
  value.gsub(' (View shipping rates and policies)', '').strip
end
-
-
1
# Page count: first word of the value in the "... pages" details list item.
def scrape_page_count
  pages_item = @page.css('table td.bucket').xpath(".//li[contains(text(), 'pages')]").first
  pages_item.text.split(':').last.strip.split(' ').first if pages_item.present?
end
-
-
1
# Language from the details bucket; matches English and German ("Sprache") labels.
def scrape_language
  label = @page.css('table td.bucket').xpath(".//b[contains(text(), 'Language')] | .//b[contains(text(), 'Sprache')]").first
  label.parent.text.split(': ').last.strip if label.present?
end
-
-
1
# ISBN-13 variant 1: read the ISBN-13 row directly, hyphens removed.
def scrape_isbn_13_1
  label = @page.css('table td.bucket').xpath(".//b[contains(text(), 'ISBN-13')]").first
  label.parent.text.split(': ').last.gsub('-', '').strip if label.present?
end
-
-
1
# ISBN-13 variant 2: derive it from the ISBN-10 row; nil when the ISBN-10 isn't 10 chars.
def scrape_isbn_13_2
  label = @page.css('table td.bucket').xpath(".//b[contains(text(), 'ISBN-10')]").first
  return unless label.present?

  isbn10 = label.parent.text.split(': ').last.strip
  ISBN_Tools.isbn10_to_isbn13(isbn10) if isbn10.length == 10
end
-
-
1
# Title variant 1: the #btAsinTitle heading with any trailing "[Format]" suffix removed.
# Returns the title String, or nil when the heading is missing or empty.
def scrape_title_1
  heading = @page.css('#btAsinTitle')
  return unless heading.present?

  # FIX: the original second guard was `text.split('[')`, which is always truthy
  # (String#split returns an Array), so an empty heading text crashed on [].first.strip.
  title_segment = heading.text.split('[').first
  title_segment.strip if title_segment
end
-
-
1
# Title variant 2: direct text children of h1#title (excludes nested span text).
def scrape_title_2
  return unless @page.xpath("//h1[@id='title']").present?

  @page.xpath("//h1[@id='title']/text()").text.strip
end
-
-
1
# Author variant 1: the contributorNameID link inside span.author.
def scrape_author_1
  link = @page.css('span.author a.contributorNameID').first
  link.text.strip if link.present?
end
-
-
1
# Author variant 2: the contributorNameTrigger element.
def scrape_author_2
  trigger = @page.css('.contributorNameTrigger').first
  trigger.text.strip if trigger.present?
end
-
-
1
# Author variant 3: first link in the byline-pipe container.
def scrape_author_3
  pipe = @page.css('span.byLinePipe').first
  return unless pipe.present?

  link = pipe.parent.css('a').first
  link.text.strip if link.present?
end
-
-
1
# The product description is served inside a <noscript> fallback block.
def scrape_amazon_description
  @page.css('noscript div').inner_text.strip.squish
end
-
-
1
# Scrapes up to three "most helpful" reviews from #revMHRL.
# Two DOM variants exist, distinguished by the row id: the newer AUI layout
# ('rev-dpReviewsMostHelpfulAUI') and the legacy layout. Each entry yields
# found_useful_count, total_vote_count, star_count, author, date and review text.
def scrape_most_helpful_amazon_reviews
  @page.css('div#revMHRL>div')[0..2].each_with_object([]) do |review_root,most_helpful_reviews|
    author = review_root.css('div span span a').present? ? review_root.css('div span span a').first.inner_text.strip.squish : nil
    date = review_root.css('div span').present? && review_root.css('div span')[3].present? ? ScraperUtilities.parse_date_string(review_root.css('div span')[3].inner_text.strip.squish.split(' on ').last) : nil

    if review_root['id'].strip.include? 'rev-dpReviewsMostHelpfulAUI'
      # AUI layout: "X of Y people found the following review helpful".
      like_stats = review_root.css('div span').present? ? review_root.css('div span').first.inner_text.strip.squish.split(' ') : []
      found_useful_count, total_vote_count = like_stats[0], like_stats[2]
      star_count = review_root.css('div div a').present? && review_root.css('div div a').first['title'].present? ? review_root.css('div div a').first['title'].strip.squish.split(' ').first : nil
      # The body lives in a sibling container whose id mirrors this row's id minus the 'rev' prefix.
      review_data_root = review_root.css("div#revData#{review_root['id'][3..-1]} div").present? ? review_root.css("div#revData#{review_root['id'][3..-1]} div")[0] : nil
      if review_data_root.present? && review_data_root.css('span.MHRHead').present?
        # Column-balanced reviews split the text; the hidden remainder sits in a data attribute
        # ('data-columnbalancing-showfullreview'), trimmed of its JS wrapper via [9..-3].
        review = review_data_root.css('span').length>2 ? review_data_root.css('span')[0..-3].inner_text.strip.squish + ' ' : review_data_root.css('span')[0].inner_text.strip.squish
        review += review_data_root.css('span').length>2 ? review_data_root.css('span')[-2]['data-columnbalancing-showfullreview'][9..-3].strip.squish : review_data_root.css('span')[1]['data-columnbalancing-showfullreview'][9..-3]
      else
        review = review_data_root.present? ? review_data_root.inner_text.strip.squish : nil
      end
    else
      # Legacy layout.
      like_stats = review_root.css('div').present? ? review_root.css('div').first.inner_text.strip.squish.split(' ') : []
      found_useful_count, total_vote_count = like_stats[0], like_stats[2]
      star_count = review_root.css('div span').first['title'].present? ? review_root.css('div span').first['title'].strip.squish.split(' ').first : nil
      review_data_root = review_root.css('div.reviewText>div.drkgry')[0]
      if review_data_root.present? && review_data_root.css('span.MHRHead').present?
        review = review_data_root.css('span.MHRHead').inner_text.strip.squish + ' '
        review += review_data_root.css('span')[1].inner_text.strip.squish
      else
        review = review_data_root.present? ? review_data_root.inner_text.strip.squish : nil
      end

    end
    most_helpful_reviews << {found_useful_count: found_useful_count,
                             total_vote_count: total_vote_count,
                             star_count: star_count,
                             author: author,
                             date: date,
                             review: review}
  end
end
-
-
1
# Collects ISBN-13s for the physical (paperback/hardcover) related-format rows.
# Row ids look like "tmm_<ISBN10>"; only valid ISBN-10s are converted and kept.
def scrape_physical_isbn13s
  @page.xpath(".//div[@class='cBoxInner']//tbody").each_with_object([]) do |tbody, isbn13s|
    next unless tbody['id'].present? && (tbody['id'].include?('paperback') || tbody['id'].include?('hardcover')) && tbody.children.present?

    candidate = tbody.children.first['id'].gsub('tmm_', '')
    isbn13s << ISBN_Tools.isbn10_to_isbn13(candidate) if ISBN_Tools.is_valid?(candidate)
  end
end
-
-
1
# Author link variant 1: direct anchor under div.buying (UK site only).
def scrape_author_tag_1
  return unless @tld == '.co.uk'

  @page.css('div.buying/a').first
end
-
-
1
# Author link variant 2: anchor nested in a span under div.buying.
def scrape_author_tag_2
  @page.css('div.buying/span/a').first
end
-
-
1
# Author link variant 3: first anchor inside span.author.
def scrape_author_tag_3
  @page.css('span.author a').first
end
-
-
1
def scrape_author_tag_4
-
@page.css('span.contributorNameTrigger a').first if @tld == '.co.uk'
-
end
-
-
1
# Author ASIN variant 1: hidden input value next to the contributorNameTrigger span.
def scrape_author_asin_1
  trigger = @page.search(".//span[@class='contributorNameTrigger']").first
  trigger.parent.search('.//input').first['value'] if trigger.present?
end
-
-
1
def scrape_author_asin_2
-
if scrape_author_tag.present?
-
author_url = scrape_author_tag['href']
-
-
if author_url.present? && scrape_author_tag['asin'].present? && Utilities.is_author_asin?(ScraperUtilities.extract_asin_from_url(author_url))
-
ScraperUtilities.extract_asin_from_url(author_url)
-
end
-
end
-
end
-
-
1
# Author ASIN variant 3: extracted from the author link's href, skipping
# session-style URLs (those containing 'UTF8').
def scrape_author_asin_3
  return unless scrape_author_tag.present?

  author_url = scrape_author_tag['href']
  return unless author_url.present? && author_url.exclude?('UTF8')

  if Utilities.is_author_asin?(ScraperUtilities.extract_asin_from_url(author_url))
    ScraperUtilities.extract_asin_from_url(author_url)
  end
end
-
-
1
def scrape_author_asin_4
-
ScraperUtilities.extract_asin_from_url(@page.css('.author_page_link a').first.attributes['href'].value) if @page.css('.author_page_link a').present?
-
end
-
-
1
# Absolute URL of the author's page, or nil when no author link was found.
def scrape_author_page_url
  force_absolute_url(scrape_author_tag['href']) if scrape_author_tag.present?
end
-
-
1
# Display name from the author link, or nil when no author link was found.
def scrape_author_name
  scrape_author_tag.text.strip if scrape_author_tag.present?
end
-
-
# Only tested on a few pages
-
1
def scrape_parent_asin_1
-
1068
if @page.css('script').select{|x| x.text.include? 'twister-media-matrix'}.present?
-
1964
@page.css('script').select{|x| x.text.include? 'twister-media-matrix'}.first.text.split(@asin).last.split(', ')[1].gsub("\"",'') if @page.css('script').select{|x| x.text.include? 'twister-media-matrix'}.first.text.include?(@asin)
-
end
-
end
-
-
# Only tested on a few pages
-
1
def scrape_parent_asin_2
-
115
if @page.css('script').select{|x| x.text.include? 'media-matrix'}.present?
-
81
@page.css('script').select{|x| x.text.include? 'media-matrix'}.first.text.split('tasParentAsin=').last.split('&').first if @page.css('script').select{|x| x.text.include? 'media-matrix'}.first.text.include?('tasParentAsin=')
-
end
-
end
-
-
# TODO Refactor, scrape methods shouldn't create other page objects, there is no good way to handle captchas and retries
-
# This only works on .de for now
-
1
# Builds competitive-title data for this ASIN's related-format box (.de only today).
# Returns {parent_asin:, featured_title:, competitive_titles: [{asin:}...], valid_page:}.
# NOTE: may fetch a second page (the "plus" link) — see the refactor TODO above.
def scrape_competitive_related_format_data
  # We currently only use this on the .de site (it also works on .co.uk) which is why "Mass Market Paperback" maps to
  # "paperback". On the .com site this maps to mass_market_paperback.
  format_to_dom_format_hash = {'Paperback' => 'paperback',
                               'Taschenbuch' => 'paperback',
                               'Mass Market Paperback' => 'paperback',
                               'Hardcover' => 'hardcover',
                               'Gebundene Ausgabe' => 'hardcover',
                               'Kindle Edition' => 'kindle'}
  competitive_details = {parent_asin: scrape_parent_asin, featured_title: nil, competitive_titles: [], valid_page: false}

  # We need to be able to map a book_format to the specific string that represents it in the <tr>'s on Amazon in
  # the related format boxes. We use .include because Kindle Editions come in a few forms but all contain 'Kindle Edition'
  if scrape_book_format.present?
    dom_format = scrape_book_format.exclude?('Kindle') ? format_to_dom_format_hash[scrape_book_format] : format_to_dom_format_hash['Kindle Edition']

    plus_link = nil

    # Can't do anything without a parent_asin and an active row, this usually indicates a lack of related format box or
    # non purchasable title, leave valid_page set to false
    if scrape_parent_asin.present? && @page.css('tr.activeRow').present?
      competitive_details[:valid_page] = true

      # Check if this title is the featured title
      featured = @page.css('tr.activeRow').first.parent['id'].include? '_winner'

      # Get the tr for the featured title whether its this @asin or not and set the featured asin
      featured_tr = nil
      if @page.xpath(".//tbody[@id='#{dom_format}_meta_binding_winner']/tr").present?
        featured_tr = @page.xpath(".//tbody[@id='#{dom_format}_meta_binding_winner']/tr").first
        competitive_details[:featured_title] = featured_tr['id'].gsub('tmm_', '') if featured_tr.present?
      end

      # Get the plus link for the featured title if it exists, if there is no buttonTD td element, there is no plus link
      plus_link = Urls.amazon_related_format_url(scrape_parent_asin, dom_format, '.de') if featured_tr.present? && featured_tr.css('td.tmm_buttonTD').present?

      # If this (@asin) is not a featured title and the featured title is valid, add this to the competitive list
      if !featured && featured_tr.present? && featured_tr.css('td.price').text.include?('EUR')
        competitive_details[:competitive_titles] << {asin: featured_tr['id'].gsub('tmm_', '')}
      end

      # If there is a plus link for this related format, collect competitive titles
      if plus_link.present?
        # Memoized so repeated calls within one scrape don't re-fetch the page.
        @plus_link_page ||= AmazonProductPage.new plus_link

        if @plus_link_page.ok?
          @plus_link_page.dom.css('tr').each do |tr|
            # Only return valid purchasable competitive titles that aren't the @asin
            if tr.css('td.price').text.include? 'EUR'
              asin = tr['id'].gsub('tmm_', '')
              competitive_details[:competitive_titles] << {asin: asin} if asin != @asin
            end
          end
        end
      end
    end
  end

  competitive_details
end
-
-
1
# Like scrape_related_format_data_1 but keyed by the raw (uncoerced) format title
# and matching only td price cells. Returns {format_title => {price:, asin:}};
# note there is no trailing .presence here — an empty Hash is returned as-is.
def scrape_related_formats_with_amazon_price
  contains_currency_condition = ScraperUtilities::CURRENCY_CONDITION
  %w[paperback hardcover mass_market_paperback kindle].each_with_object({}) do |format, hash|
    @page.xpath(".//tbody[@id='#{format}_meta_binding_winner']").xpath("(.//td[@class=' price ' and #{contains_currency_condition}])").each do |element|
      tr = element.xpath(".//ancestor::tr[@class='bucketBorderTop'][1]").first
      if tr.present?
        # 2nd column is format
        format_title = tr.css('td.tmm_bookTitle').first.text.squish.strip
        hash[format_title] = {price: ScraperUtilities.cleanse_price(element.text.strip), asin: tr['id'].gsub('tmm_','')}
        # Only the first matching row per format is used.
        break
      end
    end
  end
end
-
-
# TODO Refactor, scrape methods shouldn't create other page objects, there is no good way to handle captchas and retries
-
1
# Finds a print list price: this page's own list price if present, otherwise the
# first of Mass Market Paperback / Paperback / Hardcover related formats whose
# .com product page yields one. Returns {format:, print_list_price:} or nil.
# NOTE: may fetch other product pages — see the refactor TODO above.
def scrape_cheapest_print_list_price
  return {format: 'Original Title', print_list_price: scrape_amazon_list_price} if scrape_amazon_list_price.present?

  # Find cheapest related format then go to its page to the get the print list price
  related_formats = scrape_related_formats_with_amazon_price.reject {|x| x.include?('Kindle Edition') || x.downcase.include?('bargain')}

  ordered_formats = ['Mass Market Paperback', 'Paperback', 'Hardcover'].each_with_object({}) do |format, hash|
    hash[format] = related_formats[format] if related_formats[format].present?
  end

  # Check each format in order and return if it has a valid print list price
  ordered_formats.each_pair do |format, details|
    # Memoized per-format so repeated calls don't re-fetch pages.
    @ordered_format_pages ||= {}
    @ordered_format_pages[format] ||= AmazonProductPage.by_asin_and_tld details[:asin], '.com'
    if @ordered_format_pages[format].ok?
      list_price = @ordered_format_pages[format].scrape_amazon_list_price
      return {format: format, print_list_price: list_price} if list_price.present?
    end
  end

  nil
end
-
-
1
# Availability variant 1: the #availability element inside the buy box.
def scrape_availability_1
  availability = @page.css('#buybox #availability')
  availability.text.squish if availability.present?
end
-
-
1
# Availability variant 2: the orange availability text in the buying div.
def scrape_availability_2
  availability = @page.css('.buying .availOrange')
  availability.text.squish if availability.present?
end
-
-
1
# True when the Kindle Unlimited badge image is present in the buying area.
def scrape_kindle_unlimited
  @page.css('.buying img[alt="Kindle Unlimited"]').present?
end
-
end
-
1
class AmazonSearchPage < AmazonPage
-
1
def self.by_isbn_or_asin_and_tld(isbn_or_asin, tld)
-
new Urls.amazon_search_page(isbn_or_asin, tld)
-
end
-
-
1
def initialize(url, user_agent = 'Mac FireFox')
-
24
super url, user_agent
-
end
-
-
1
def search_results
-
@page.css('.results .celwidget').collect do |widget|
-
details = {}
-
-
details[:category_name] = @page.css('#breadCrumb').text.squish.gsub('›', '>') if @page.css('#breadCrumb').present?
-
details[:rank] = widget['id'].gsub('result_', '').to_i + 1
-
details[:author_name] = widget.css('.newaps .med.reg').text.squish.scan(/^by (.*?) \(/).flatten.first.squish if widget.css('.newaps .med.reg').present?
-
details[:price] = ScraperUtilities.cleanse_price(widget.css('.rsltL .digp .bld.red.lrg, .rsltL .newp .bld.red.lrg').text.strip) if widget.css('.rsltL .digp .bld.red.lrg, .rsltL .newp .bld.red.lrg').present?
-
details[:title] = widget.css('.newaps a .lrg.bold').text.strip if widget.css('.newaps a .lrg.bold').present?
-
details[:asin] = ScraperUtilities.extract_asin_from_url(widget.css('.image a').first['href']) if widget.css('.image a').present?
-
details[:url] = widget.css('.image a').first['href'] if widget.css('.image a').present?
-
-
details
-
end
-
end
-
-
# Finds the matching url from the list of search results
-
# Assumes the first URL is the match if there is only one result, otherwise consults url_hints, which is an array of asins
-
1
def matching_url_from_search_results(url_hints)
-
4
urls = scrape_search_results_urls
-
4
urls.present? && urls.count == 1 ? urls.first : ScraperUtilities.match_url_with_asins(urls, url_hints)
-
end
-
-
1
def scrape_search_results_urls_1
-
@page.css('.results h3 > a').collect do |anchor|
-
12
anchor.attributes['href'].try(:text)
-
11
end.compact
-
end
-
-
1
def scrape_search_results_urls_2
-
@page.css('.results .image a').collect do |anchor|
-
1
anchor.attributes['href'].try(:text)
-
4
end.compact
-
end
-
-
1
def scrape_search_result_asins_1
-
@page.css('.asinReviewsSummary').collect{|x| x['name']} if @page.css('.asinReviewsSummary').present?
-
end
-
-
1
def scrape_search_result_asins_2
-
anchors = @page.css('.results h3 > a')
-
anchors.collect{|anchor| ScraperUtilities.extract_asin_from_url(anchor.attributes['href'].text)} if anchors.present?
-
end
-
-
1
def scrape_search_result_asins_3
-
anchors = @page.css('.results .image a')
-
anchors.collect{|anchor| ScraperUtilities.extract_asin_from_url(anchor.attributes['href'].text)} if anchors.present?
-
end
-
-
# Does not return the url for current page, only OTHER pagination links.
# Fills in pagination links not visible that exist after the last given
# pagination link, thus only truly works from the first result page for now.
# Returns nil (via #presence) when no pagination urls were found.
def scrape_pagination_urls
  # Get all urls from active, numbered pagination links
  urls = @page.css('#pagn .pagnLink a').collect do |element|
    force_absolute_url element['href']
  end
  # Fill in urls past the active pagination links up to the last disabled,
  # numerical pagination link. Guard on urls.present? so urls.last cannot be
  # nil — previously a '.pagnDisabled' marker with no active '.pagnLink'
  # anchors raised NoMethodError on nil.
  if urls.present? && @page.css('#pagn .pagnDisabled').present?
    max_page_number = @page.css('#pagn .pagnDisabled').text.strip.to_i
    min_page_number = urls.last.scan(/page=(\d+)/).flatten.first.to_i + 1
    # Plain #each: the original used each_with_object with a mismatched block
    # arity and ignored the accumulator, appending to the closed-over array.
    (min_page_number..max_page_number).each do |num|
      urls << urls.last.gsub(/sr_pg_\d+/, "sr_pg_#{num}").gsub(/page=\d+/, "page=#{num}")
    end
  end

  urls.presence
end
-
-
1
# Reads the current page number from the active pagination marker.
# Returns 0 when the marker is absent or its text is non-numeric (String#to_i).
def scrape_search_page_number
  current_marker = @page.css('#pagn .pagnCur')
  current_marker.text.to_i
end
-
end
-
1
# Wraps an Apple (iTunes) top-books RSS/Atom category feed and extracts
# per-entry chart stats. Inherits fetching/parsing behavior from Page.
class AppleTopBooksRssFeed < Page
  # Builds a feed page for the given iTunes category id and feed type.
  def self.by_category_id_and_type(category_id, type)
    new Urls.apple_book_category_feed(category_id, type)
  end

  def initialize(url, user_agent = 'Mac FireFox')
    super url, user_agent
  end

  # Returns an array of hashes, one per feed <entry>, ranked by feed order
  # (rank is 1-based index). Missing fields come back nil from the getters.
  def stats
    @page.xpath('//entry').each_with_index.collect do |element, index|
      {rank: index + 1,
       title: get_title(element),
       author: get_author(element),
       itunes_id: get_itunes_id(element),
       price: get_price(element)}
    end
  end

  private

  # Entry title, truncated to 255 chars; nil when absent or blank.
  def get_title(element)
    element.xpath('name').text.first(255).presence if element.xpath('name').present?
  end

  # Entry author name; nil when absent or blank.
  def get_author(element)
    element.xpath('artist').text.presence if element.xpath('artist').present?
  end

  # iTunes id from the id element's im:id attribute; nil when absent.
  def get_itunes_id(element)
    element.xpath('id').attribute('im:id').value.presence if element.xpath('id').present?
  end

  # Price in integer cents (feed 'amount' is in dollars); nil when absent.
  def get_price(element)
    (element.xpath('price').attribute('amount').value.to_f * 100).to_i if element.xpath('price').present? && element.xpath('price').attribute('amount').value.present?
  end
end
-
1
# Scraper for a single Barnes & Noble book product page. The numbered
# scrape_*_N methods are fallbacks tried in order by Page#method_missing
# (e.g. calling scrape_price tries scrape_price_1/2/3 until one returns
# a present value).
class BnBookPage < BnPage
  def self.by_ean(ean)
    new Urls.bn_book_page(ean)
  end

  # On physical book pages sometimes the physical price and nook price will be listed but it will always get the main
  # physical price. On Nook pages only the Nook price will be listed so the first item will be blank and then the Nook
  # price will be found. Textbook pages are a totally different layout.
  def scrape_price_1
    ScraperUtilities.cleanse_price(@page.css('.product-price .price').text.strip) if @page.css('.product-price .price').present?
  end

  def scrape_price_2
    ScraperUtilities.cleanse_price(@page.css('.nook-price .price').text.strip) if @page.css('.nook-price .price').present?
  end

  def scrape_price_3
    ScraperUtilities.cleanse_price(@page.css('.buy-box-textbook .price').first.text.strip) if @page.css('.buy-box-textbook .price').present?
  end

  def scrape_nook_price
    ScraperUtilities.cleanse_price(@page.css('.nook-price .price').text.strip) if @page.css('.nook-price .price').present?
  end

  def scrape_nook_list_price
    ScraperUtilities.cleanse_price(@page.css('.nook-price .list').text.strip) if @page.css('.nook-price .list').present?
  end

  def scrape_list_price_1
    ScraperUtilities.cleanse_price(@page.css('.list').first.text.strip) if @page.css('.list').present?
  end

  def scrape_list_price_2
    # textbook style List Price scrape
    @page.css('.product-details-textbook ul li').each do |li|
      if li.text.include? 'List Price'
        return li.css('.value').present? ? ScraperUtilities.cleanse_price(li.css('.value').text.strip) : nil
      end
    end

    nil
  end

  # Decodes the star rating from the stars-large CSS class name, e.g. a class
  # token like "r4h" becomes "4.5" ('r' stripped, 'h' = half star → ".5").
  # Only runs when the reviews section is on the page.
  def scrape_average_rating
    @page.css('span.starDisplay > span.stars-large').attr('class').value.split(' ').second.gsub('r', '').gsub('h', '.5') if @page.css('div.reviews-share').present? && @page.css('span.starDisplay').present?
  end

  # First run of digits in the ratings total label, as a string; nil otherwise.
  def scrape_rating_count
    @page.css('span.starDisplay > span.total').text.scan(/\d+/).first if @page.css('div.reviews-share').present? && @page.css('span.starDisplay').present?
  end

  # BN shows a single combined count; reuse the rating count.
  def scrape_review_count
    scrape_rating_count
  end

  # Scans the product-details list for the 'Sales rank' row and returns its
  # first digit run with thousands separators removed; nil when not found.
  def scrape_sales_rank
    product_details_elements = @page.css('div.product-details ul li')
    if @page.css('div.reviews-share').present? && product_details_elements.present?
      product_details_elements.each do |li|
        return li.text.gsub(',', '').scan(/\d+/).first if li.text.include? 'Sales rank'
      end
    end

    nil
  end

  # Finds the NOOK entry among the alternate-format links and returns
  # {'NOOK Book' => {price:, ean:}} (either value may be nil); nil when no
  # NOOK format is listed.
  def scrape_related_format_data
    @page.css('li.format').each do |element|
      if element.text.include? 'NOOK'
        nook_price_element = element.css('div.bn-price a').first
        nook_price = nook_price_element.present? ? ScraperUtilities.cleanse_price(nook_price_element.text.strip) : nil
        nook_ean = nook_price_element.present? && nook_price_element['data-bn-rel'].present? ? ScraperUtilities.parse_ean_from_bn_url(nook_price_element['data-bn-rel']) : nil
        return {'NOOK Book' => {price: nook_price, ean: nook_ean}}
      end
    end

    nil
  end

  # Collects "customers also bought" tiles into an array of hashes
  # (title/ean/author/price); returns nil (via #presence) when empty.
  def scrape_also_boughts
    @page.css('.display-tile-item').each_with_object([]) do |element, array|
      title_element = element.css('a > img').first
      if title_element.present?
        author_element = element.css('.contributor').first
        price_element = element.css('.price').first
        array << {title: title_element.attributes['alt'].text.squish,
                  ean: (element['data-bn-ean']),
                  author: (author_element.text.squish if author_element.present?),
                  price: (ScraperUtilities.cleanse_price(price_element.text.strip) if price_element.present?)}
      end
    end.presence
  end

  # EAN of the NOOK edition, parsed from the format link's parent href;
  # nil when no NOOK format entry or no usable href exists.
  def scrape_nook_ean
    @page.css('li.format').each do |element|
      if element.text.include? 'NOOK'
        return element.css('span.name').present? && element.css('span.name').first.parent['href'].present? ? ScraperUtilities.parse_ean_from_bn_url(element.css('span.name').first.parent['href']) : nil
      end
    end

    nil
  end

  #TODO Unused, delete?
  #def self.determine_ean_and_format_from_li(li)
  #  div = li.css('div')
  #  ean = div.first['data-bn-ean']
  #  format = div.css('div.price-format > a > span.format').text
  #
  #  if format.include? 'BN.com'
  #    # If you see this in traversal it means the book is broken/redirection loop
  #    return ean,nil
  #  end
  #
  #  return ean,format
  #end
end
-
1
# A Barnes & Noble category listing page, fetched through a category-specific
# HTTP helper rather than Page's default fetcher.
class BnCategoryPage < BnPage
  def self.by_category_id_and_book_format_and_page_number(category_id, book_format, page_number)
    new Urls.bn_category_page(category_id, book_format, page_number)
  end

  def initialize(url, user_agent = 'Mac FireFox')
    # Pre-assigning @page matters: Page#initialize uses `@page ||= ...`,
    # so setting it here makes super skip its own HTTP fetch and keep this
    # category-specific result.
    @page = HttpHelper.get_bn_category_page_html url, user_agent
    super url, user_agent
  end
end
-
1
# Scraper for BN's "NOOK Daily Find" promo page: one featured deal book plus
# several carousels of related deal titles.
class BnNookDailyFindPage < BnPage
  def initialize(user_agent = 'Mac FireFox')
    super 'http://www.barnesandnoble.com/u/ebook-nook-daily-find-bargain-deal/379003102/', user_agent
  end

  # Scrapes the single featured deal. Returns a hash with :daily_deal_type,
  # :title, :author_name, :price and either :isbn (valid ISBN) or :bn_id.
  def scrape_daily_find_book
    details = {}
    details[:daily_deal_type] = @page.css('.st-single-product .header').text.squish if @page.css('.st-single-product .header').present?
    details[:title] = @page.css('.st-single-product .body .title').text.squish
    details[:author_name] = @page.css('.st-single-product .body .contributor').text.squish
    details[:price] = ScraperUtilities.cleanse_price(@page.css('.st-single-product .body .price').text)
    ean = @page.css('.st-single-product .body .title a').present? ? ScraperUtilities.parse_ean_from_bn_url(@page.css('.st-single-product .body .title a').first.attributes['href'].value) : nil
    # EANs that validate as ISBNs are stored as :isbn; anything else
    # (including nil) falls through to :bn_id.
    if ISBN_Tools.is_valid? ean
      details[:isbn] = ean
    else
      details[:bn_id] = ean
    end

    details
  end

  # Scrapes every carousel on the page into a flat array of detail hashes.
  # :rank is the 1-based position within each carousel (restarts per carousel);
  # :daily_deal_type is the carousel's header text.
  def scrape_daily_find_carousels
    daily_finds = []
    @page.css('.st-carousel').each do |carousel|
      carousel.css('.product-root-node').each_with_index do |product_container, index|
        details = {}
        details[:daily_deal_type] = carousel.css('.header h3').text.squish
        details[:rank] = index + 1
        # Prefer the title embedded in the cover image's alt text
        # ("Title: ..., Author: ..."); fall back to the visible title element.
        details[:title] = product_container.css('.linked-image img').first.attributes['alt'].value.scan(/Title: (.*?), Author:/).flatten.first.squish if product_container.css('.linked-image img').present? && product_container.css('.linked-image img').first.attributes['alt'].value.present?
        details[:title] = product_container.css('.product-title').text.squish if details[:title].blank?
        details[:author_name] = product_container.css('.contributers-line').text.squish
        ean = product_container.css('.product-title a').present? ? ScraperUtilities.parse_ean_from_bn_url(product_container.css('.product-title a').first.attributes['href'].value) : nil
        if ISBN_Tools.is_valid? ean
          details[:isbn] = ean
        else
          details[:bn_id] = ean
        end

        daily_finds << details
      end
    end

    daily_finds
  end
end
-
1
# Scraper for BN's paginated "NOOK Books under $2.99" listing. The listing is
# exposed in fixed 30-item pages addressed by a start offset.
class BnNookUnder299 < BnPage
  # Valid ?start= offsets for the four listing pages.
  START_NUMBERS = %w[1 31 61 91].freeze
  def self.by_start_number(num)
    raise 'Invalid Starting Number' unless START_NUMBERS.include? num.to_s

    new "http://www.barnesandnoble.com/u/ebooks-nook-books-bargain-deal-3-or-less/379003858?start=#{num}"
  end

  # Scrapes every result on this page into an array of detail hashes.
  # :rank is the absolute position derived from the ?start= offset plus the
  # 0-based index on the page.
  def book_details
    book_details = []
    rank = query_parameters['start'].first.to_i
    @page.css('.result').each_with_index do |element, index|
      details = {}
      details[:daily_deal_type] = 'NOOK Books Under $2.99'
      details[:rank] = rank + index
      # Prefer the title embedded in the cover image alt text; fall back to
      # the visible title link.
      details[:title] = element.css('.image-block img').first.attributes['alt'].value.scan(/Title: (.*?), Author:/).flatten.first.squish if element.css('.image-block img').present? && element.css('.image-block img').first.attributes['alt'].value.present?
      details[:title] = element.css('.title a').first.text.squish if details[:title].blank? && element.css('.title a').present?
      details[:author] = element.css('.contributor a').first.text.squish if element.css('.contributor a').present?
      # Strips '$' and '.' so "$2.99" becomes "299" (price in cents as string).
      details[:price] = element.css('.pricing.bn-price strong').first.text.squish.gsub(/\$|\./, '') if element.css('.pricing.bn-price strong').present?
      details[:url] = force_absolute_url(element.css('.title a').first.attributes['href'].text) if element.css('.title a').present?
      ean = details[:url].present? ? ScraperUtilities.parse_ean_from_bn_url(details[:url]) : nil
      # Valid ISBNs go under :isbn; anything else (including nil) under :bn_id.
      if ISBN_Tools.is_valid? ean
        details[:isbn] = ean
      else
        details[:bn_id] = ean
      end
      book_details << details
    end

    book_details
  end
end
-
1
# Base page for all Barnes & Noble scrapes. Swallows persistent-HTTP
# connection failures during the initial fetch and records them so callers
# can check net_persistent_error? rather than rescuing themselves.
class BnPage < Page
  def initialize(url, user_agent = 'Mac FireFox')
    super url, user_agent
  rescue Net::HTTP::Persistent::Error
    @net_persistent_error = true
    {}
  end

  # True when the page fetch failed with Net::HTTP::Persistent::Error;
  # nil (falsy) otherwise.
  def net_persistent_error?
    @net_persistent_error
  end
end
-
1
# A Barnes & Noble search results page, used to resolve a book (by ISBN or by
# title/author/format) to a BN EAN.
class BnSearchPage < BnPage
  def self.by_isbn(isbn)
    new Urls.bn_search_page(isbn)
  end

  def self.by_title_and_author_and_book_format(title, author, book_format)
    new Urls.bn_search_page_by_details(title, author, book_format), 'Mac FireFox', book_format
  end

  def initialize(url, user_agent = 'Mac FireFox', book_format = '')
    super url, user_agent
    # Map our book_format vocabulary to BN's format labels; a Kindle edition
    # corresponds to BN's NOOK format. Anything else is assumed Hardcover.
    @search_category = if book_format.include? 'Paperback'
                         'Paperback'
                       elsif book_format.include? 'Kindle'
                         'NOOK'
                       else
                         'Hardcover'
                       end
  end

  # Returns the EAN of the first search-result block whose pub date matches
  # pub_date and which offers a format matching @search_category; nil when no
  # block matches.
  def ean_for(pub_date)
    # check if any search results show same pub date as book version, then check for a book of your type in that block
    if @page.search(".//li[@id='search-result0']/div/section/p/a").present?
      @page.search(".//span[@class='date']").each do |date|
        begin
          if Date.strptime(date.text, '(%m/%d/%Y)') == pub_date
            # Traverse to the top of the block and search if block has a book of the correct type to get the ean
            date.parent.parent.parent.parent.search(".//table[@class='displayed-formats']//tr//a").each do |a|
              #scrape url and get EAN which is ISBN or BN ID (URLs are in 2 possible formats)
              return ScraperUtilities.parse_ean_from_bn_url(a['data-bn-rel']) if a.attr('data-bntrack').include?(@search_category)
            end
          end
        rescue ArgumentError => e
          # Date.strptime raises ArgumentError on malformed date text; log and
          # keep scanning the remaining date spans.
          Rails.logger.tagged('cleanup') {Rails.logger.info "Pub date in date bound search flow returned malformed date string, can't parse date: #{e.message}"}
        end
      end
    end

    nil
  end

  # True when BN rendered its explicit "no results" message.
  def no_results?
    @page.css('div.search-noresults-message').present? && @page.css('div.search-noresults-message').text.strip.include?('Sorry, we could not find what you were looking for.')
  end
end
-
1
# Join model linking a warehouse book version to a named category within a
# region. Category names are unique per book version.
class BookVersionCategory < ActiveRecord::Base
  # Attributes

  attr_accessible :warehouse_book_version, :warehouse_book_version_id, :warehouse_category, :warehouse_category_id,
                  :warehouse_region, :warehouse_region_id, :category_name

  # Associations

  belongs_to :warehouse_book_version, inverse_of: :book_version_categories
  belongs_to :warehouse_category, inverse_of: :book_version_categories
  belongs_to :warehouse_region, inverse_of: :book_version_categories

  # Validations

  # warehouse_category_id is intentionally not required here, unlike the
  # other foreign keys.
  validates_presence_of :warehouse_book_version_id, :warehouse_region_id, :category_name
  validates_uniqueness_of :category_name, scope: :warehouse_book_version_id
end
-
1
# Records per-day scrape problems for a book version (book missing from
# search, missing image/price/buy button, invalid Apple data, etc.) as a set
# of boolean flags.
class BookVersionException < ActiveRecord::Base
  # Attributes

  attr_accessible :warehouse_book_version, :warehouse_book_version_id, :warehouse_date, :warehouse_date_id,
                  :amazon_not_found_in_search, :amazon_no_image, :amazon_no_buy_button, :amazon_no_price,
                  :bn_not_found_in_search, :no_isbn, :amazon_ambiguous_result, :apple_invalid

  # Associations

  belongs_to :warehouse_book_version, inverse_of: :book_version_exceptions
  belongs_to :warehouse_date, inverse_of: :book_version_exceptions

  # Validations

  validates_presence_of :warehouse_book_version_id
  validates_presence_of :warehouse_date_id
end
-
1
# Mongo collection tracking the current status of each book version, keyed by
# warehouse_book_version_id.
class BookVersionStatusCollection < Mongo::Collection
  # Binds to the 'book_version_statuses' collection on the global connection.
  def initialize(opts = {})
    super 'book_version_statuses', $mongodb, opts
  end

  # Upserts the status document for the given book version id.
  def set_book_version_status(warehouse_book_version_id, status)
    update({_id: warehouse_book_version_id}, {'$set' => {status: status}}, upsert: true)
  end
end
-
1
# Daily roll-up of data coverage: per-date totals of how many book versions
# have each scraped attribute, plus helpers that turn a total into a coverage
# ratio. coverage_for_<attribute> is handled dynamically via method_missing.
class BooklrStat < ActiveRecord::Base
  # Attributes

  attr_accessible :date, :number_of_books, :number_of_book_versions, :number_of_ingested_book_versions,
                  :number_of_book_version_stats, :amazon_average_rating_total, :amazon_review_count_total, :barnes_average_rating_total,
                  :barnes_rating_count_total, :barnes_review_count_total, :amazon_sales_rank_total, :amazon_sales_rank_category_total,
                  :barnes_sales_rank_total, :amazon_list_price_total, :amazon_price_total, :sub_category1_id_total,
                  :sub_category1_tree_total, :sub_category1_rank_total, :sub_category2_id_total, :sub_category2_tree_total,
                  :sub_category2_rank_total, :sub_category3_id_total, :sub_category3_tree_total, :sub_category3_rank_total,
                  :likes_total, :digital_list_price_total, :bn_nook_price_total, :bn_nook_list_price_total, :bn_price_total,
                  :also_bought_total, :bought_after_viewing_total, :frequently_bought_together_total, :bn_also_bought_total,
                  :similar_items_by_category_total, :amazon_related_format_data_total, :bn_related_format_data_total,
                  :author_ranks_total, :bn_list_price_total, :itunes_price_total, :itunes_average_rating_total,
                  :itunes_rating_count_total, :goodreads_work_average_rating_total, :goodreads_work_rating_count_total,
                  :goodreads_work_review_count_total, :goodreads_work_added_by_count_total, :goodreads_work_to_read_count_total,
                  :goodreads_edition_average_rating_total, :goodreads_edition_rating_count_total, :goodreads_edition_review_count_total,
                  :goodreads_edition_added_by_count_total

  # Validations

  validates_presence_of :date
  validates_uniqueness_of :date

  # Miscellaneous

  # Attribute groups used for reporting coverage by data source.
  def self.amazon_attributes
    %w[amazon_average_rating_total amazon_review_count_total amazon_sales_rank_total amazon_sales_rank_category_total
       amazon_list_price_total amazon_price_total sub_category1_id_total sub_category1_tree_total sub_category1_rank_total
       sub_category2_id_total sub_category2_tree_total sub_category2_rank_total sub_category3_id_total sub_category3_tree_total
       sub_category3_rank_total likes_total digital_list_price_total author_ranks_total]
  end

  def self.barnes_and_noble_attributes
    %w[barnes_average_rating_total barnes_rating_count_total barnes_review_count_total barnes_sales_rank_total bn_nook_price_total
       bn_nook_list_price_total bn_price_total bn_list_price_total]
  end

  def self.itunes_attributes
    %w[itunes_price_total itunes_average_rating_total itunes_rating_count_total]
  end

  def self.goodreads_attributes
    %w[goodreads_work_average_rating_total goodreads_work_rating_count_total goodreads_work_review_count_total
       goodreads_work_added_by_count_total goodreads_work_to_read_count_total goodreads_edition_average_rating_total
       goodreads_edition_rating_count_total goodreads_edition_review_count_total goodreads_edition_added_by_count_total]
  end

  def self.customer_behavior_attributes
    %w[also_bought_total bought_after_viewing_total frequently_bought_together_total bn_also_bought_total
       similar_items_by_category_total amazon_related_format_data_total bn_related_format_data_total]
  end

  # Ratio of the given total to the ingested (or overall) book-version count;
  # 0.0 when the total is blank. The attribute value is read once instead of
  # twice as in the original.
  def coverage_for(attribute)
    value = send(attribute)
    value.present? ? value / (number_of_ingested_book_versions || number_of_book_versions).to_f : 0.0
  end

  # Dynamic coverage_for_<attribute> readers, e.g. coverage_for_likes_total.
  def method_missing(method_name, *args, &block)
    method_name =~ /^coverage_for_(.+)$/ ? coverage_for($1) : super
  end

  # Keep respond_to? consistent with method_missing (was missing before,
  # so respond_to?(:coverage_for_likes_total) incorrectly returned false).
  def respond_to_missing?(method_name, include_private = false)
    method_name.to_s =~ /^coverage_for_.+$/ ? true : super
  end
end
-
1
# Per-category, per-region, per-date rank aggregates (best/worst/mean/median)
# across book versions, with links to the best- and worst-ranked versions.
class CategoryStat < ActiveRecord::Base
  attr_accessible :best_rank, :worst_rank, :book_version_count, :mean_rank, :median_rank, :category_name,
                  :best_rank_book_version, :best_rank_book_version_id, :worst_rank_book_version, :worst_rank_book_version_id,
                  :warehouse_region, :warehouse_region_id, :warehouse_date, :warehouse_date_id, :warehouse_category,
                  :warehouse_category_id

  delegate :date, to: :warehouse_date
  delegate :tld, to: :warehouse_category

  belongs_to :best_rank_book_version, class_name: 'WarehouseBookVersion', foreign_key: 'best_rank_book_version_id', inverse_of: :best_rank_category_stats
  belongs_to :worst_rank_book_version, class_name: 'WarehouseBookVersion', foreign_key: 'worst_rank_book_version_id', inverse_of: :worst_rank_category_stats
  # NOTE(review): inverse_of :warehouse_categories on the next two
  # associations looks copy-pasted — for a CategoryStat one would expect
  # inverse_of :category_stats (as used below). Confirm against the
  # WarehouseRegion/WarehouseDate models before changing.
  belongs_to :warehouse_region, inverse_of: :warehouse_categories
  belongs_to :warehouse_date, inverse_of: :warehouse_categories
  belongs_to :warehouse_category, inverse_of: :category_stats
end
-
1
# Date-sharded Mongo collection for raw category stats, e.g.
# 'category_stats_20140101' for 2014-01-01.
class CategoryStatsCollection < Mongo::Collection
  def initialize(date, opts = {})
    super "category_stats_#{date.to_s.gsub('-', '')}", $mongodb, opts
  end

  # Inserts one stats document (or an array of them) into the shard.
  def add_category_stats(category_stats)
    insert category_stats
  end
end
-
1
# Scraper for a Goodreads book page. The rating breakdown lives in an inline
# <script> containing escaped HTML, which is unescaped and re-parsed.
class GoodreadsBookPage < Page
  def self.by_key(key)
    new Urls.goodreads_book_page(key)
  end

  def initialize(url, user_agent = 'Windows Mozilla')
    super url, user_agent
  end

  # Returns a HashWithIndifferentAccess of work- and edition-level rating
  # stats plus per-star counts, or nil when the embedded markup is missing.
  def rating_details
    ratings_details = nil

    # rating details popup box is in a <script> tag with the DOM elements, find it then re-parse it
    if @page.css('#bookMeta script').present?
      text = @page.css('#bookMeta script').first.text
      # NOTE(review): after gsub("\\", '') removes all backslashes, the
      # following gsub(/\\\"/, '') can never match — presumably redundant;
      # kept as-is pending confirmation against live Goodreads markup.
      doc = Nokogiri::HTML(text.gsub("\\n", '').gsub("\\", '').gsub(/\\\"/, '').squish)

      if doc.css('table').present? && doc.css('table').count > 1
        # The last table's spans hold the nine stats in a fixed order.
        values = doc.css('table').last.css('span').collect(&:text)
        ratings_details = {goodreads_work_average_rating: values[0],
                           goodreads_work_rating_count: values[1],
                           goodreads_work_review_count: values[2],
                           goodreads_work_added_by_count: values[3],
                           goodreads_work_to_read_count: values[4],
                           goodreads_edition_average_rating: values[5],
                           goodreads_edition_rating_count: values[6],
                           goodreads_edition_review_count: values[7],
                           goodreads_edition_added_by_count: values[8]}.with_indifferent_access

        # Star-by-star counts from rows 1..5 of the distribution table
        # (row 0 is the header).
        if doc.css('table#rating_distribution').present?
          doc.css('table#rating_distribution tr')[1..5].each do |tr|
            star_count = tr.css('th').first.text.squish
            value = tr.css('td').last.text.squish
            ratings_details["goodreads_#{star_count}_star_count"] = value
          end
        end
      end
    end

    ratings_details
  end
end
-
1
# Base class for all scraped pages. Fetches the URL on construction (via
# Mechanize when a user agent is given, plain HTTP otherwise), records
# response state, and dispatches scrape_foo calls to numbered fallback
# implementations (scrape_foo_1, scrape_foo_2, ...) via method_missing.
class Page
  attr_reader :url, :user_agent, :response_code

  # Fetch errors listed in HTTP_ERRORS are swallowed and logged; callers
  # should check ok? / socks_error? / response_code afterwards.
  def initialize(url, user_agent = nil)
    @url = url
    @user_agent = user_agent
    # NOTE(review): URI.encode is deprecated and removed in Ruby 3 — replace
    # with Addressable or URI::DEFAULT_PARSER.escape before upgrading.
    @uri = URI.parse(URI.encode(url))
    @base_url = "#{@uri.scheme}://#{@uri.host}"
    @tld = case
           when @uri.hostname.ends_with?('.com')
             '.com'
           when @uri.hostname.ends_with?('.co.uk')
             '.co.uk'
           when @uri.hostname.ends_with?('.de')
             '.de'
           else
             nil
           end
    @scraped_at = Time.current
    begin
      # ||= lets subclasses (e.g. BnCategoryPage) pre-assign @page and skip
      # this fetch entirely.
      @page ||= user_agent.present? ? HttpHelper.get_html_with_mechanize_no_rescue(url, user_agent) : HttpHelper.get_html(url)
      @ok = @page.present?
      @response_code = '200'
      @socks_error = false
    rescue *HTTP_ERRORS => e
      if e.class == Mechanize::ResponseCodeError
        error = {error_class: Mechanize::ResponseCodeError.to_s, code: e.response_code}
        @response_code = e.response_code
        @error_class = e.class.to_s
      elsif e.is_a? SOCKSError
        error = {error_class: e.class.to_s}
        @socks_error = true
      else
        error = {error_class: e.class.to_s}
      end
      Utilities.log('http_error', error)
      {}
    end
  end

  # True when the fetch succeeded and produced a non-blank document.
  def ok?
    @ok
  end

  # True when the fetch failed through the SOCKS proxy.
  def socks_error?
    @socks_error
  end

  # The parsed document (nil when the fetch failed).
  def dom
    @page
  end

  # Parsed query string of the page URL. Raises if the URL has no query —
  # callers only use this on URLs known to carry one.
  def query_parameters
    CGI.parse(@uri.query)
  end

  # Dispatches scrape_foo to scrape_foo_1, scrape_foo_2, ... in order,
  # returning the first present value; nil when the page isn't ok or no
  # fallback produced a value.
  def method_missing(method_name, *args, &block)
    methods_to_call = public_methods(false).select {|method| method.to_s.starts_with?('scrape_') && method =~ /^#{Regexp.quote method_name}_\d+$/}
    if methods_to_call.present?
      return nil unless ok?

      methods_to_call.each do |method|
        value = send method
        return value if value.present?
      end

      nil
    else
      super
    end
  end

  # Keep respond_to? consistent with method_missing (previously missing, so
  # respond_to?(:scrape_price) reported false even though the call worked).
  def respond_to_missing?(method_name, include_private = false)
    public_methods(false).any? {|method| method.to_s.starts_with?('scrape_') && method =~ /^#{Regexp.quote method_name}_\d+$/} || super
  end

  private

  # Resolves a possibly-relative href against this page's scheme://host.
  def force_absolute_url(url)
    ScraperUtilities.force_absolute_url url, @base_url
  end
end
-
1
# Mongo-backed report record. Subclasses are distinguished by the
# 'report_name' field, which find/where use to re-instantiate the correct
# class via constantize.
class Report
  # Lifecycle states a report moves through.
  STATUSES = [:new, :processing, :unable_to_process, :error_while_processing, :completed].freeze

  def self.collection
    $mongodb.collection('reports')
  end

  # Builds, persists, and returns a new report.
  def self.create(doc = {})
    report = new doc
    report.save

    report
  end

  # Looks up one report by id (String ids are coerced to BSON::ObjectId when
  # legal) and instantiates it as its recorded subclass; nil when not found.
  def self.find(id)
    id = id.is_a?(String) && BSON::ObjectId.legal?(id) ? BSON::ObjectId.from_string(id) : id
    record = collection.find(_id: id).limit(1).first
    record.present? ? record['report_name'].constantize.new(record) : nil
  end

  # Queries reports, newest first. When called on a subclass, scopes the
  # query to that subclass's records. Bug fix: the original tested
  # `self.class == Report` — inside a class method self.class is Class, so
  # the test was always false — and then discarded the result of the
  # non-destructive merge, so the report_name filter was never applied.
  def self.where(conditions = {})
    conditions = conditions.merge(report_name: to_s) unless self == Report
    collection.find(conditions).sort({created_at: -1}).collect do |record|
      record['report_name'].constantize.new record
    end
  end

  # Marks the given reports as processing and appends batch_id to each
  # report's batch_ids in a single bulk update.
  def self.start_batch(ids, batch_id)
    query = ids.collect {|id| {_id: id}}
    collection.update({'$or' => query}, {'$push' => {batch_ids: batch_id}, '$set' => {status: 'processing'}}, upsert: true) if query.present?
  end

  # Merges defaults under the supplied doc; when a record with the same _id
  # already exists, the stored document wins entirely.
  def initialize(doc = {})
    @document = {_id: BSON::ObjectId.new, status: 'new', report_name: self.class.to_s, file_details: {}}.with_indifferent_access.merge doc
    @document['batch_ids'] = Array.wrap(@document['batch_ids'])
    existing_report = collection.find(_id: @document[:_id]).limit(1).first
    @document = existing_report.with_indifferent_access if existing_report.present?
    @document[:file_details] = @document[:file_details].with_indifferent_access
    @document[:file_details].each {|key, val| @document[:file_details][key] = val.with_indifferent_access}
  end

  def id
    @document['_id']
  end

  # Simple readers over the backing document.
  %w[report_name status created_at error params s3_url batch_ids file_details].each do |method_name|
    define_method method_name do
      @document[method_name]
    end
  end

  def collection
    @collection ||= Report.collection
  end

  # Upserts the backing document, stamping created_at on first save.
  def save
    raise ArgumentError unless @document[:batch_ids].is_a?(Array)

    @document['created_at'] ||= Time.current.to_time
    collection.update({_id: id}, @document, upsert: true)
  end

  def destroy
    collection.remove _id: id
  end
end
-
1
# A report representing one batch job. Requires :job_type and :batch_params
# in the constructor document.
class ReportBatch < Report
  # Raises ArgumentError naming every required key that is blank.
  def initialize(extra = {})
    extra = extra.with_indifferent_access
    missing_keys = %i[job_type batch_params].select {|key| extra[key].blank?}
    raise ArgumentError, "Missing Keys: #{missing_keys.join(', ')}" unless missing_keys.blank?

    # The original also merge!-d :job_type and :batch_params onto their own
    # current values — a no-op — which has been removed.
    super extra
  end

  # The subset of the document exposed as report params.
  def params
    @document.select {|key, _| %w[job_type batch_params].include? key}
  end

  def batch_params
    @document[:batch_params]
  end

  def job_type
    @document[:job_type]
  end
end
-
1
# Fish & Richardson title-dimensions report: physical .com titles with their
# parsed length/width/depth (inches) and top similar-item categories.
class ReportCards::FishRichardson < ReportCards::ReportCard
  @report_name = :fish_richardson

  def initialize(email, ftp, gzip, enable_report_blocking)
    super email, ftp, gzip, enable_report_blocking

    @client_name = :fish_richardson
    @report_name = self.class.report_name
    @header = ['ISBN', 'Title', 'Author', 'Format', 'Published Date', 'Length', 'Width', 'Depth', 'Similar Item Category 1',
               'Similar Item Category 2', 'Similar Item Category 3']
    # physical_details appears three times on purpose: columns 5/6/7 all start
    # as the raw dimensions string and are rewritten in output_row!.
    @report_row_keys = %w[warehouse_book_versions_isbn13 warehouse_book_versions_title warehouse_book_versions_author_name
                          warehouse_book_versions_book_format warehouse_book_versions_pub_date warehouse_book_versions_physical_details
                          warehouse_book_versions_physical_details warehouse_book_versions_physical_details
                          warehouse_stats_amazon_similar_item_category_tree_1 warehouse_stats_amazon_similar_item_category_tree_2
                          warehouse_stats_amazon_similar_item_category_tree_3]
    # Sorted ascending by id so output_row! can bsearch for membership.
    @warehouse_book_version_ids = WarehouseBookVersion.ingested.com.where{physical_details != nil}.
      where(book_format: %w[Paperback Hardcover]).where("physical_details LIKE '%inches%'").order(:id).value_of(:id)
    @expected_count = nil
  end

  # Emits one CSV row when the row's book version is in scope and its
  # dimensions parse; returns true when a row was written, false otherwise.
  def output_row!(row_keys, row)
    @warehouse_book_version_id_index ||= row_keys.index('warehouse_stats_warehouse_book_version_id')
    @physical_details_index ||= row_keys.index('warehouse_book_versions_physical_details')
    if @warehouse_book_version_ids.bsearch {|warehouse_book_version_id| row[@warehouse_book_version_id_index].to_i - warehouse_book_version_id}.present?
      output_row = generate_output_row(row)

      # output_row[5] [6] and [7] are the same field, physical_details, if it has 2 x's the data has 3 dimensions, if it has 1 x it has 2 dimensions
      # and we assume its just length and width

      if output_row[5].scan(' x ').count == 2
        # set all 3 values then sort from biggest to smallest so depth always ends up last
        dimensions = [output_row[5].split(' x ').first, output_row[6].split(' x ').second, output_row[7].split(' x ').third.split(' inches').first].map{|x| x.to_f}.sort.reverse
        output_row[5] = dimensions[0]
        output_row[6] = dimensions[1]
        output_row[7] = dimensions[2]
      elsif output_row[5].scan(' x ').count == 1
        # set both values then sort from biggest to smallest so depth always ends up last
        dimensions = [output_row[5].split(' x ').first, output_row[6].split(' x ').second.split(' inches').first].map{|x| x.to_f}.sort.reverse
        output_row[5] = dimensions[0]
        # With only two dimensions, the smaller value is treated as width when
        # over 2 inches, otherwise as depth; the other column gets 'N/A'.
        output_row[6] = dimensions[1].to_f > 2 ? dimensions[1] : 'N/A'
        output_row[7] = dimensions[1].to_f < 2 ? dimensions[1] : 'N/A'
      else
        return false
      end

      @csv << as_csv_row(output_row)

      true
    end
  end

  def send_all_complete?
    true
  end

  private

  def base_filename
    "fish-richardson-title-dimensions-report-#{@report_date.strftime('%m%d%y')}"
  end
end
-
1
# Random House corporate daily report (v3): per-title pricing, rank, category,
# and Goodreads stats for all ingested .com book versions.
class ReportCards::RandomHouseCorporate < ReportCards::ReportCard
  @report_name = :corporate

  def initialize(email, ftp, gzip, enable_report_blocking)
    super email, ftp, gzip, enable_report_blocking

    @client_name = :rhinc
    @report_name = self.class.report_name
    @header = ['ISBN', 'Title', 'Author', 'Publisher', 'Pub Date', 'Format', 'Nook Price', 'BN Nook List Price', 'BN Rank',
               'BN Date Stamp', 'BN Time Stamp', 'Amz Actual Price', 'Amz Digital List Price', 'Amz Print List Price',
               'ASIN', 'Amz Sales Rank', 'Amz Category 1', 'Amz Category 1 Rnk', 'Amz Category 2', 'Amz Category 2 Rnk',
               'Amz Category 3', 'Amz Category 3 Rnk', 'Amz Category 4', 'Amz Category 4 Rnk', 'Amz Date Stamp', 'Amz Time Stamp',
               'BN ID', 'BN Physical List Price', 'goodreads_work_average_rating', 'goodreads_work_rating_count',
               'goodreads_work_review_count', 'goodreads_work_added_by_count', 'goodreads_work_to_read_count',
               'goodreads_edition_average_rating', 'goodreads_edition_rating_count', 'goodreads_edition_review_count',
               'goodreads_edition_added_by_count', 'goodreads_work_5_star_count', 'goodreads_work_4_star_count',
               'goodreads_work_3_star_count', 'goodreads_work_2_star_count', 'goodreads_work_1_star_count']
    # Report is blocked from sending until this many rows exist.
    @expected_count = WarehouseBookVersion.com.ingested.count if enable_report_blocking
    # Positional mapping onto @header: literal '00:00:00' entries fill the
    # time-stamp columns, '' entries leave Category 4 name/rank blank.
    @report_row_keys = ['warehouse_book_versions_isbn13', 'warehouse_book_versions_title', 'warehouse_book_versions_author_name',
                        'warehouse_book_versions_publisher', 'warehouse_book_versions_pub_date', 'warehouse_book_versions_book_format',
                        'warehouse_stats_bn_nook_price', 'warehouse_stats_bn_nook_list_price', 'warehouse_stats_bn_sales_rank',
                        'warehouse_dates_date', '00:00:00', 'warehouse_stats_amazon_price', 'warehouse_stats_amazon_digital_list_price',
                        'warehouse_stats_amazon_list_price', 'warehouse_book_versions_asin', 'warehouse_stats_amazon_sales_rank',
                        'warehouse_amazon_category1_name', 'warehouse_stats_amazon_category1_rank', 'warehouse_amazon_category2_name',
                        'warehouse_stats_amazon_category2_rank', 'warehouse_amazon_category3_name', 'warehouse_stats_amazon_category3_rank',
                        '', '', 'warehouse_dates_date', '00:00:00', 'warehouse_book_versions_bn_id', 'warehouse_stats_bn_list_price',
                        'warehouse_stats_goodreads_work_average_rating', 'warehouse_stats_goodreads_work_rating_count',
                        'warehouse_stats_goodreads_work_review_count', 'warehouse_stats_goodreads_work_added_by_count',
                        'warehouse_stats_goodreads_work_to_read_count', 'warehouse_stats_goodreads_edition_average_rating',
                        'warehouse_stats_goodreads_edition_rating_count', 'warehouse_stats_goodreads_edition_review_count',
                        'warehouse_stats_goodreads_edition_added_by_count', 'warehouse_stats_goodreads_5_star_count',
                        'warehouse_stats_goodreads_4_star_count', 'warehouse_stats_goodreads_3_star_count',
                        'warehouse_stats_goodreads_2_star_count', 'warehouse_stats_goodreads_1_star_count']
  end

  def send_all_complete?
    true
  end

  private

  def base_filename
    "random-house-corporate-report-v3-#{@report_date.strftime("%m%d%y")}"
  end
end
-
1
# Random House corporate daily report (v4): identical to RandomHouseCorporate
# plus a trailing kindle_unlimited column.
class ReportCards::RandomHouseCorporate2 < ReportCards::ReportCard
  @report_name = :corporate2

  def initialize(email, ftp, gzip, enable_report_blocking)
    super email, ftp, gzip, enable_report_blocking

    @client_name = :rhinc
    @report_name = self.class.report_name
    @header = ['ISBN', 'Title', 'Author', 'Publisher', 'Pub Date', 'Format', 'Nook Price', 'BN Nook List Price', 'BN Rank',
               'BN Date Stamp', 'BN Time Stamp', 'Amz Actual Price', 'Amz Digital List Price', 'Amz Print List Price',
               'ASIN', 'Amz Sales Rank', 'Amz Category 1', 'Amz Category 1 Rnk', 'Amz Category 2', 'Amz Category 2 Rnk',
               'Amz Category 3', 'Amz Category 3 Rnk', 'Amz Category 4', 'Amz Category 4 Rnk', 'Amz Date Stamp', 'Amz Time Stamp',
               'BN ID', 'BN Physical List Price', 'goodreads_work_average_rating', 'goodreads_work_rating_count',
               'goodreads_work_review_count', 'goodreads_work_added_by_count', 'goodreads_work_to_read_count',
               'goodreads_edition_average_rating', 'goodreads_edition_rating_count', 'goodreads_edition_review_count',
               'goodreads_edition_added_by_count', 'goodreads_work_5_star_count', 'goodreads_work_4_star_count',
               'goodreads_work_3_star_count', 'goodreads_work_2_star_count', 'goodreads_work_1_star_count', 'kindle_unlimited']
    # Report is blocked from sending until this many rows exist.
    @expected_count = WarehouseBookVersion.com.ingested.count if enable_report_blocking
    # Positional mapping onto @header: literal '00:00:00' entries fill the
    # time-stamp columns, '' entries leave Category 4 name/rank blank.
    @report_row_keys = ['warehouse_book_versions_isbn13', 'warehouse_book_versions_title', 'warehouse_book_versions_author_name',
                        'warehouse_book_versions_publisher', 'warehouse_book_versions_pub_date', 'warehouse_book_versions_book_format',
                        'warehouse_stats_bn_nook_price', 'warehouse_stats_bn_nook_list_price', 'warehouse_stats_bn_sales_rank',
                        'warehouse_dates_date', '00:00:00', 'warehouse_stats_amazon_price', 'warehouse_stats_amazon_digital_list_price',
                        'warehouse_stats_amazon_list_price', 'warehouse_book_versions_asin', 'warehouse_stats_amazon_sales_rank',
                        'warehouse_amazon_category1_name', 'warehouse_stats_amazon_category1_rank', 'warehouse_amazon_category2_name',
                        'warehouse_stats_amazon_category2_rank', 'warehouse_amazon_category3_name', 'warehouse_stats_amazon_category3_rank',
                        '', '', 'warehouse_dates_date', '00:00:00', 'warehouse_book_versions_bn_id', 'warehouse_stats_bn_list_price',
                        'warehouse_stats_goodreads_work_average_rating', 'warehouse_stats_goodreads_work_rating_count',
                        'warehouse_stats_goodreads_work_review_count', 'warehouse_stats_goodreads_work_added_by_count',
                        'warehouse_stats_goodreads_work_to_read_count', 'warehouse_stats_goodreads_edition_average_rating',
                        'warehouse_stats_goodreads_edition_rating_count', 'warehouse_stats_goodreads_edition_review_count',
                        'warehouse_stats_goodreads_edition_added_by_count', 'warehouse_stats_goodreads_5_star_count',
                        'warehouse_stats_goodreads_4_star_count', 'warehouse_stats_goodreads_3_star_count',
                        'warehouse_stats_goodreads_2_star_count', 'warehouse_stats_goodreads_1_star_count',
                        'warehouse_stats_kindle_unlimited']
  end

  def send_all_complete?
    true
  end

  private

  def base_filename
    "random-house-corporate-report-v4-#{@report_date.strftime('%m%d%y')}"
  end
end
-
1
# Daily iTunes/Apple pricing report for Random House (:rhinc). Restricted to
# .com book versions that have an itunes_id; other warehouse rows are skipped
# in #output_row!.
class ReportCards::RandomHouseCorporateApple < ReportCards::ReportCard
  @report_name = :daily_apple

  def initialize(email, ftp, gzip, enable_report_blocking)
    super email, ftp, gzip, enable_report_blocking

    @client_name = :rhinc
    @report_name = self.class.report_name
    @header = ['ISBN', 'Title', 'Author', 'Publisher', 'Pub Date', 'Format', 'iTunes Price', 'iTunes Average Rating',
               'iTunes Rating Count', 'Date Stamp', 'Time Stamp']
    # '00:00:00' is a literal constant column (see ReportCard#generate_row_key_indexes).
    @report_row_keys = %w[warehouse_book_versions_isbn13 warehouse_book_versions_title warehouse_book_versions_author_name warehouse_book_versions_publisher
                          warehouse_book_versions_pub_date warehouse_book_versions_book_format warehouse_stats_itunes_price warehouse_stats_itunes_average_rating
                          warehouse_stats_itunes_rating_count warehouse_dates_date 00:00:00]
    # order(:id) matters: #output_row! bsearches this array, which requires
    # ascending order.
    @warehouse_book_version_ids = WarehouseBookVersion.com.where('itunes_id is not null').ingested.order(:id).value_of(:id)
    @expected_count = @warehouse_book_version_ids.count
  end

  # Emits the row only when its warehouse_book_version_id is in the iTunes id
  # set; returns true when written, nil (row skipped) otherwise.
  def output_row!(row_keys, row)
    # Memoized column lookup. NOTE(review): if the column were ever missing,
    # ||= would re-run the index scan for every row — assumed always present.
    @warehouse_book_version_id_index ||= row_keys.index('warehouse_stats_warehouse_book_version_id')
    # bsearch in find-any mode: block returns 0 on match, positive/negative to
    # steer the search; relies on the sorted order established in #initialize.
    if @warehouse_book_version_ids.bsearch {|warehouse_book_version_id| row[@warehouse_book_version_id_index].to_i - warehouse_book_version_id}.present?
      @csv << as_csv_row(generate_output_row(row))

      true
    end
  end

  # Participates in the batch "all complete" FTP signal.
  def send_all_complete?
    true
  end

  private

  # Basename (no extension), stamped MMDDYY.
  def base_filename
    "rh-apple-report-#{@report_date.strftime("%m%d%y")}"
  end
end
-
1
module ReportCards
  # Base class for all client report cards. Subclasses set @report_name at the
  # class level, then populate @client_name, @header, @report_row_keys (and
  # optionally @expected_count) in #initialize. The shared machinery here
  # streams warehouse rows into a CSV (optionally gzipped), sanity-checks the
  # row count, and delivers the file via FTP, email and S3.
  class ReportCard
    attr_writer :report_date

    # Class-level report identifier set by each subclass (e.g. :corporate2).
    def self.report_name
      @report_name
    end

    # Finds the subclass registered under report_name (string or symbol).
    def self.report_card_class_by_report_name(report_name)
      descendants.find {|klass| klass.report_name.to_s == report_name.to_s}
    end

    # Uploads an empty ALLFILES.DONE marker to the Random House FTP server to
    # signal that every report in the batch has been delivered.
    # NOTE(review): FTP credentials are hard-coded here and in #ftp_to_client;
    # they belong in configuration.
    def self.send_rhinc_ftp_completion
      # FTP Completion file
      file = File.open('/tmp/ALLFILES.DONE', 'w')
      ftp = Net::FTP.open('ftp.randomhouse.com', 'booklr', 'B00Klr')
      ftp.passive = true
      ftp.chdir 'to_rh'
      ftp.putbinaryfile file.path
      ftp.close
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report Delivered to RH FTP Server (#{file.path})-----"}

      # Close the handle before unlinking (previously the file was deleted
      # first and a dangling handle closed afterwards).
      file.close
      File.delete(file.path)
    end

    # email/ftp/gzip toggle delivery channels; enable_report_blocking turns on
    # the row-count gate in #row_count_valid?.
    def initialize(email, ftp, gzip, enable_report_blocking)
      @email, @ftp, @gzip, @enable_report_blocking = [email, ftp, gzip, enable_report_blocking]

      # Subclasses are expected to populate these.
      @csv = @client_name = @report_name = @header = @expected_count = nil
      @report_row_keys = []
    end

    # Redis field prefix, e.g. "rhinc_corporate2".
    def report_key
      "#{@client_name}_#{@report_name}"
    end

    # Opens the output file under the configured per-client reports directory,
    # wrapping it in a GzipWriter when @gzip is set.
    def open_csv
      file_location = File.join(AmazeBot.config[:reports][:location][Utilities.env], @client_name.to_s, filename)
      dirname = File.dirname file_location
      begin
        # File.exist? — File.exists? is deprecated and removed in Ruby 3.2.
        # The rescue covers the race where another process creates the
        # directory between the check and the mkdir.
        Dir.mkdir dirname unless File.exist? dirname
      rescue Errno::EEXIST
        # Directory already exists
      end
      @csv = File.new file_location, 'wb'
      @csv = Zlib::GzipWriter.new(@csv) if @gzip
    end

    # Resolves @report_row_keys against the warehouse column names. Each key
    # may be a '|'-separated fallback chain; parts that are not column names
    # are kept as literal strings and emitted verbatim (used for constants
    # such as '00:00:00').
    def generate_row_key_indexes(row_keys)
      @row_key_indexes = @report_row_keys.collect do |report_row_key|
        report_row_key.split('|').collect do |key_part|
          index = row_keys.index(key_part)
          index.present? ? index : key_part
        end
      end
    end

    # Row count recorded by the report generation run (0 when unset).
    def get_row_count
      $redis.hget('daily_report_stats', "#{report_key}_row_count").to_i
    end

    def set_row_count(count)
      $redis.hmset('daily_report_stats', "#{report_key}_row_count", count)
    end

    def set_time_sent(time_sent)
      $redis.hmset('daily_report_stats', "#{report_key}_completion_time", time_sent)
    end

    def insert_header!
      @csv << as_csv_row(@header) if @header.present?
    end

    # Writes one warehouse row to the CSV. Returns true so callers can count
    # emitted rows; subclasses may skip rows by returning a falsy value.
    def output_row!(row_keys, row)
      csv_row = as_csv_row(generate_output_row(row))
      @csv << csv_row

      true
    end

    def finalize_output!
      @csv.flush
      @csv.close
    end

    def move_to_s3
      uploader = ReportUploader.new
      uploader.client_name = @client_name
      uploader.store! File.new(@csv.path)

      true
    end

    # FTPs and emails the finished report when the row-count sanity check
    # passes; otherwise notifies engineering and blocks delivery.
    def deliver_report
      if row_count_valid?
        ftp_to_client
        email_client
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_key} Report Delivered -----"}
        set_time_sent Time.current.to_s
      else
        NotificationMailer.report_row_count_error(report_key, "Report blocked from sending and FTPing because todays count: #{get_row_count} was too far off the expected count: #{@expected_count}").deliver
        # Tolerance in #row_count_valid? is 0.05, i.e. 5% — the old message
        # incorrectly said "0.5% of more".
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_key} Report NOT Delivered: row count off by 5% or more -----"}
      end
    end

    # Whether this report should trigger the batch "all complete" signal;
    # subclasses opt in by overriding to true.
    def send_all_complete?
      false
    end

    private

    # Maps one warehouse row through @row_key_indexes: within each fallback
    # chain the first Integer index whose row value is present wins; String
    # entries are literal constants and always win when reached.
    def generate_output_row(row)
      @row_key_indexes.collect do |indexes|
        value = nil
        indexes.each {|index| value ||= index.is_a?(Integer) ? row[index].presence : index}

        value
      end
    end

    # NOTE(review): naive join — values containing commas or quotes are not
    # escaped; assumed safe for the warehouse data being exported.
    def as_csv_row(array)
      "#{array.join(',')}\n"
    end

    # Subclasses must override.
    def base_filename
      nil
    end

    def filename
      @gzip ? "#{base_filename}.csv.gz" : "#{base_filename}.csv"
    end

    # Valid when blocking is off, no expectation is set, or the recorded count
    # is within 5% of @expected_count.
    def row_count_valid?
      !@enable_report_blocking || @expected_count.nil? || ((@expected_count - get_row_count) / @expected_count.to_f).abs < 0.05
    end

    # Only Random House gets FTP delivery, and only in production.
    def ftp_to_client
      if Rails.env.production? && @ftp
        if @client_name.to_sym == :rhinc
          # FTP CSV File
          ftp = Net::FTP.open('ftp.randomhouse.com', 'booklr', 'B00Klr')
          ftp.passive = true
          ftp.chdir 'to_rh'
          ftp.putbinaryfile @csv.path
          ftp.close
          Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report Delivered to RH FTP Server (#{@csv.path})-----"}
        end
      end
    end

    def email_client
      return unless @email && @report_name.present?

      report_hash = {
        base_filename: base_filename,
        filename: filename,
        report_format: 'csv',
        client_name: @client_name
      }
      EnterpriseReportsMailer.basic_report(report_hash, AmazeBot.config[:reports][:clients][@client_name][:reports][@report_name].with_indifferent_access).deliver
    end
  end
end
-
1
# Customer-behavior report for Random House (:rhinc): review/rating counts,
# star breakdowns, "also bought"/"bought after viewing" fields and author
# ranks, one row per book version.
class ReportCards::RhincCustomerBehavior < ReportCards::ReportCard
  @report_name = :customer_behavior

  def initialize(email, ftp, gzip, enable_report_blocking)
    super email, ftp, gzip, enable_report_blocking

    @client_name = :rhinc
    @report_name = self.class.report_name
    # Header columns; field groups come from WarehouseStat constants so they
    # stay in sync with the warehouse schema.
    @header = %w[bn_review_count bn_average_rating amazon_review_count amazon_average_rating one_star_count two_star_count
                 three_star_count four_star_count five_star_count amazon_likes] +
              WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_FIELDS +
              WarehouseStat::WAREHOUSE_AMAZON_BOUGHT_AFTER_VIEWING_FIELDS +
              WarehouseStat::WAREHOUSE_AMAZON_FREQUENTLY_BOUGHT_TOGETHER_FIELDS +
              WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS +
              WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_ITEMS_BY_FIELDS +
              WarehouseStat::WAREHOUSE_BN_ALSO_BOUGHT_FIELDS +
              %w[overall_author_rank sub_category1_author_rank_id_name sub_category1_author_rank sub_category2_author_rank_id_name
                 sub_category2_author_rank sub_category3_author_rank_id_name sub_category3_author_rank sub_category4_author_rank_id_name
                 sub_category4_author_rank]
    # Same field groups, prefixed into warehouse column names. The 'a|b'
    # entries are fallback chains resolved by
    # ReportCard#generate_row_key_indexes (first resolvable part wins).
    @report_row_keys = (%w[bn_review_count bn_average_rating amazon_review_count amazon_average_rating one_star_count two_star_count
                           three_star_count four_star_count five_star_count amazon_likes] +
                        WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_FIELDS +
                        WarehouseStat::WAREHOUSE_AMAZON_BOUGHT_AFTER_VIEWING_FIELDS +
                        WarehouseStat::WAREHOUSE_AMAZON_FREQUENTLY_BOUGHT_TOGETHER_FIELDS +
                        WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS +
                        WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_ITEMS_BY_FIELDS +
                        WarehouseStat::WAREHOUSE_BN_ALSO_BOUGHT_FIELDS).collect {|name| "warehouse_stats_#{name}"} +
                       %w[warehouse_stats_overall_author_rank
                          sub_category1_author_rank_id_name|warehouse_stats_sub_category1_author_rank_id_fallback warehouse_stats_sub_category1_author_rank
                          sub_category2_author_rank_id_name|warehouse_stats_sub_category2_author_rank_id_fallback warehouse_stats_sub_category2_author_rank
                          sub_category3_author_rank_id_name|warehouse_stats_sub_category3_author_rank_id_fallback warehouse_stats_sub_category3_author_rank
                          sub_category4_author_rank_id_name|warehouse_stats_sub_category4_author_rank_id_fallback warehouse_stats_sub_category4_author_rank]
  end

  # Participates in the batch "all complete" FTP signal.
  def send_all_complete?
    true
  end

  private

  # Basename (no extension), stamped MMDDYY.
  def base_filename
    "rhinc-all-customer-behavior-#{@report_date.strftime("%m%d%y")}"
  end
end
-
1
# On-demand report comparing one book's price against (1) other books in its
# similar-item categories and (2) its "customers also bought" neighbours, for
# a single warehouse date. Produces average-price and price-frequency CSVs,
# uploads them to S3 and records the URLs in the report document.
class SimilarBookPricingReport < Report
  include ReportUtilities

  # Requires extra[:warehouse_date_id] and at least one of :asin / :isbn13.
  # Raises ArgumentError listing whichever keys are missing.
  def initialize(extra = {})
    @client_name = :booklr
    @extra_folders = %w[big-data-reports price-reports]
    extra = extra.with_indifferent_access
    missing_keys = []
    missing_keys << :warehouse_date_id if extra[:warehouse_date_id].blank?
    missing_keys << :asin << :isbn13 if extra[:asin].blank? && extra[:isbn13].blank?
    raise ArgumentError.new("Missing Keys: #{missing_keys.join(', ')}") unless missing_keys.blank?

    # Deterministic _id: rerunning for the same book/date reuses one document.
    extra.merge! _id: "similar-book-pricing-#{extra[:asin] || 'X'}-#{extra[:isbn13] || 'X'}-#{extra[:warehouse_date_id]}", asin: extra[:asin], isbn13: extra[:isbn13], warehouse_date_id: extra[:warehouse_date_id]

    super extra
  end

  # Just the identifying keys from the stored document.
  def params
    @document.select {|key, _| %w[asin isbn13 warehouse_date_id].include? key}
  end

  # Builds both report pairs and saves the document with statuses/URLs.
  def generate
    warehouse_date = WarehouseDate.find @document[:warehouse_date_id]
    # NOTE(review): lookup is by ASIN only, yet #initialize accepts
    # isbn13-only input — confirm callers always provide an ASIN.
    book_version = WarehouseBookVersion.find_by(asin: @document[:asin], tld: '.com')
    stat = WarehouseStat.find_by(warehouse_book_version_id: book_version.id, warehouse_date_id: warehouse_date.id)
    if stat.blank?
      # Message previously named Top100ProjectedRankReport (copy/paste error).
      @document[:error] = 'Book version is valid but has no data for today, let your big data helpers know so they can investigate. Error in SimilarBookPricingReport, this should never happen.'
      @document[:status] = :error_while_processing
    else
      # --- Pair 1: books sharing this book's similar-item categories.
      average_price_report_name = "similar-top-100-book-pricing-#{@document[:asin]}-#{warehouse_date.date.strftime('%m%d%y')}"
      price_frequency_report_name = "#{average_price_report_name}-frequency-bar-chart"
      similar_categories = stat.all_similar_item_categories
      if similar_categories.present?
        all_list_stat_prices = []
        # price => frequency over every other listed book in those categories
        # on this date (our own ASIN excluded).
        list_stat_price_distribution = WarehouseListStat.where(warehouse_category_id: similar_categories.collect(&:id), warehouse_date_id: warehouse_date.id).where.not(asin: book_version.asin).each_with_object({}) do |list_stat, hash|
          if list_stat.price.present?
            hash[list_stat.price] ||= 0
            hash[list_stat.price] += 1
            all_list_stat_prices << list_stat.price.to_i
          end
        end
        s3_url = output_average_price_report average_price_report_name, warehouse_date.date, book_version, stat, all_list_stat_prices, similar_categories
        @document[:file_details][average_price_report_name] = {s3_url: s3_url}

        s3_url = output_price_frequency_report price_frequency_report_name, warehouse_date.date, book_version, stat, list_stat_price_distribution, similar_categories
        @document[:file_details][price_frequency_report_name] = {s3_url: s3_url}
      else
        @document[:file_details][average_price_report_name] = {error: :unable_to_generate, message: 'Book had no similar item categories'}
        @document[:file_details][price_frequency_report_name] = {error: :unable_to_generate, message: 'Book had no similar item categories'}
      end

      # --- Pair 2: the book's "customers also bought" neighbours.
      average_price_report_name = "similar-also-bought-book-pricing-#{@document[:asin]}-#{warehouse_date.date.strftime('%m%d%y')}"
      price_frequency_report_name = "#{average_price_report_name}-frequency-bar-chart"
      # Warehouse stores up to six also-bought ASIN columns.
      also_bought_asins = 6.times.collect {|num| stat.send("amazon_also_bought_asin_#{num + 1}")}.compact
      if also_bought_asins.present?
        book_versions = WarehouseBookVersion.where(asin: also_bought_asins, tld: '.com')
        # NOTE(review): when none of the also-bought ASINs resolve, no
        # file_details entry is written for this pair — confirm intended.
        if book_versions.present?
          all_also_bought_prices = []
          # Block param renamed from `stat`, which shadowed the outer
          # WarehouseStat local above.
          also_bought_price_distribution = WarehouseStat.where(warehouse_book_version_id: book_versions.collect(&:id), warehouse_date_id: warehouse_date.id).each_with_object({}) do |also_bought_stat, hash|
            if also_bought_stat.amazon_price.present?
              hash[also_bought_stat.amazon_price] ||= 0
              hash[also_bought_stat.amazon_price] += 1
              all_also_bought_prices << also_bought_stat.amazon_price.to_i
            end
          end
          s3_url = output_average_price_report average_price_report_name, warehouse_date.date, book_version, stat, all_also_bought_prices
          @document[:file_details][average_price_report_name] = {s3_url: s3_url}

          s3_url = output_price_frequency_report price_frequency_report_name, warehouse_date.date, book_version, stat, also_bought_price_distribution
          @document[:file_details][price_frequency_report_name] = {s3_url: s3_url}
        end
      else
        @document[:file_details][average_price_report_name] = {error: :unable_to_generate, message: 'Book had no also bought data'}
        @document[:file_details][price_frequency_report_name] = {error: :unable_to_generate, message: 'Book had no also bought data'}
      end

      @document[:status] = :completed
    end

    save
  end

  private

  # CSV with the subject book's details plus mean/median/mode of all_prices;
  # returns the S3 URL of the uploaded file.
  def output_average_price_report(report_name, report_date, book_version, stat, all_prices, similar_categories = nil)
    price_report_hash = EnterpriseReports.generate_report_hash(report_name, @client_name)
    csv = EnterpriseReports.open_csv(price_report_hash)

    csv << %w[Title Author Format Asin Price Date]
    csv << [book_version.title, book_version.author_name, book_version.book_format, book_version.asin, "$#{as_price stat.amazon_price}", report_date.to_s]
    if similar_categories.present?
      csv << []
      csv << ['Categories used']
      similar_categories.each {|category| csv << [category.name]}
    end
    csv << ["Sample Size: #{all_prices.size} books"]
    csv << []
    csv << ['Price Mean', 'Price Median', 'Price Mode']
    csv << ["$#{as_price all_prices.mean}", "$#{as_price all_prices.median}", all_prices.mode.collect {|price| "$#{as_price price}"}.join(', ')]

    s3_url = EnterpriseReports.move_to_s3(@client_name, csv, @extra_folders)
    csv.close

    s3_url
  end

  # CSV of the raw price => frequency distribution plus bucketed price
  # ranges (~10 slices); returns the S3 URL of the uploaded file.
  def output_price_frequency_report(report_name, report_date, book_version, stat, price_distribution, similar_categories = nil)
    frequency_report_hash = EnterpriseReports.generate_report_hash(report_name, @client_name)
    csv = EnterpriseReports.open_csv(frequency_report_hash)

    csv << %w[Title Author Format Asin Price Date]
    csv << [book_version.title, book_version.author_name, book_version.book_format, book_version.asin, "$#{as_price stat.amazon_price}", report_date.to_s]
    if similar_categories.present?
      csv << []
      csv << ['Categories used']
      similar_categories.each {|category| csv << [category.name]}
    end
    csv << []
    csv << %w[Price Frequency]
    price_distribution.sort.each {|price, frequency| csv << ["$#{as_price price}", frequency]}
    csv << []
    csv << ['Price Range', 'Frequency']
    # Prices are in cents; the range is built in whole dollars then expanded
    # back to cents when counting.
    range = (price_distribution.keys.sort.first / 100.0).floor..(price_distribution.keys.sort.last / 100.0).ceil
    range.each_slice([range.count / 10, 1].max) do |slice|
      csv << ["$#{as_price(slice.first * 100)} - $#{as_price(slice.last * 100 + 99)}", price_distribution.select {|price, _| price >= slice.first * 100 && price <= slice.last * 100 + 99}.values.sum]
    end

    s3_url = EnterpriseReports.move_to_s3(@client_name, csv, @extra_folders)
    csv.close

    s3_url
  end
end
-
1
# Builds six CSVs of price-frequency distributions per top-100 category for a
# single warehouse date: list price and Amazon list price, each split into
# all / major-publisher / other-publisher variants.
class Top100PriceDistributionReport < Report
  include ReportUtilities

  # Requires extra[:warehouse_date_id]; raises ArgumentError otherwise.
  def initialize(extra = {})
    @client_name = :booklr
    extra = extra.with_indifferent_access
    raise ArgumentError.new('Missing Keys: warehouse_date_id') unless extra[:warehouse_date_id].present?

    # Deterministic _id: rerunning the same date reuses one document.
    extra.merge! _id: "top100-price-distribution-#{extra[:warehouse_date_id]}", warehouse_date_id: extra[:warehouse_date_id]

    super extra
  end

  # Just the identifying keys from the stored document.
  def params
    @document.select {|key, _| %w[warehouse_date_id].include? key}
  end

  def generate
    price_distribution = {}
    major_publisher_price_distribution = {}
    other_publisher_price_distribution = {}
    amazon_list_price_distribution = {}
    major_publisher_amazon_list_price_distribution = {}
    other_publisher_amazon_list_price_distribution = {}
    warehouse_date = WarehouseDate.find @document[:warehouse_date_id]
    # {sold_by: [...], publisher: [...]} values that mark a "major" publisher.
    major_publishers = ActiveRecord::Base.connection.execute('select type, value from warehouse_major_publishers').each_with_object({sold_by: [], publisher: []}) do |row, hash|
      hash[:sold_by] << row['value'] if row['type'] == 'sold_by'
      hash[:publisher] << row['value'] if row['type'] == 'publisher'
    end
    # warehouse_book_version_id => amazon_list_price for this date.
    prices_by_warehouse_book_version_id = WarehouseStat.where(warehouse_date_id: @document[:warehouse_date_id]).value_of(:amazon_list_price, :warehouse_book_version_id).each_with_object({}) do |price_and_id, hash|
      hash[price_and_id[1]] = price_and_id[0]
    end

    # Raw SQL so the whole day's list stats stream through one query instead
    # of instantiating ActiveRecord objects per row.
    sql = WarehouseListStat.joins('left outer join warehouse_categories on warehouse_list_stats.warehouse_category_id = warehouse_categories.id left outer join warehouse_book_versions on warehouse_list_stats.warehouse_book_version_id = warehouse_book_versions.id').
        select('warehouse_categories.name as category_name, warehouse_book_version_id, price, sold_by, publisher').
        where(warehouse_date_id: @document[:warehouse_date_id]).to_sql
    ActiveRecord::Base.connection.execute(sql).values.each do |category_name, warehouse_book_version_id, price, sold_by, publisher|
      major_publisher = major_publishers[:sold_by].include?(sold_by) || major_publishers[:publisher].include?(publisher)
      if price.present?
        add_frequency_data_to_hash price_distribution, category_name, price
        add_frequency_data_to_hash major_publisher_price_distribution, category_name, price if major_publisher
        add_frequency_data_to_hash other_publisher_price_distribution, category_name, price unless major_publisher
      end

      amazon_list_price = prices_by_warehouse_book_version_id[warehouse_book_version_id.to_i]
      if amazon_list_price.present?
        add_frequency_data_to_hash amazon_list_price_distribution, category_name, amazon_list_price
        add_frequency_data_to_hash major_publisher_amazon_list_price_distribution, category_name, amazon_list_price if major_publisher
        add_frequency_data_to_hash other_publisher_amazon_list_price_distribution, category_name, amazon_list_price unless major_publisher
      end
    end

    output_report "price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", price_distribution
    output_report "major-publisher-price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", major_publisher_price_distribution
    output_report "other-publisher-price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", other_publisher_price_distribution
    output_report "amazon-list-price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", amazon_list_price_distribution
    output_report "major-publisher-amazon-list-price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", major_publisher_amazon_list_price_distribution
    output_report "other-publisher-amazon-list-price-distribution-by-category-#{warehouse_date.date.strftime('%m%d%y')}", other_publisher_amazon_list_price_distribution

    client_config = AmazeBot.config[:reports][:clients][@client_name]
    InternalReportsMailer.basic_report(id, @document[:file_details], client_config[:reports][:top100_price_distribution]).deliver

    @document[:status] = :completed
    save
  end

  private

  # Accumulates one observed price into a per-category raw list and a
  # price => count frequency map.
  def add_frequency_data_to_hash(distribution_hash, category_name, price)
    distribution_hash[category_name] ||= {}
    distribution_hash[category_name][:prices] ||= []
    distribution_hash[category_name][:prices] << price
    distribution_hash[category_name][:price_frequencies] ||= {}
    distribution_hash[category_name][:price_frequencies][price] ||= 0
    distribution_hash[category_name][:price_frequencies][price] += 1
  end

  # Buckets each category's frequencies into fixed dollar ranges ($1 steps to
  # $19.99, $10 steps to $99, $100 steps to $999, then $1000+), writes the CSV
  # and uploads it to S3. Records an error entry when there is no data.
  def output_report(report_name, distribution_hash)
    if distribution_hash.present?
      file_location_hash = {report_location: EnterpriseReports.get_report_location(@client_name, EnterpriseReports.get_filename(report_name))}
      range_frequencies = {}
      global_range_frequencies = {}

      distribution_hash.each do |category_name, frequency_hash|
        ((0..19).to_a + (20..99).step(10).to_a + (100..999).step(100).to_a + [1000, 1000000]).each_cons(2) do |range_start, range_end|
          # Start the first bucket at one cent rather than $0.00.
          range_start = 0.01 if range_start == 0
          range_frequencies[category_name] ||= {}
          # Prices are stored in cents; range bounds are dollars.
          price_count = frequency_hash[:price_frequencies].select {|price, _| price.to_i >= range_start * 100 && price.to_i <= range_end * 100 - 1}.values.sum
          # NOTE(review): Integer#present? is true even for 0, so this branch
          # never filters empty buckets — confirm whether `price_count > 0`
          # was intended.
          if price_count.present?
            range_key = "$#{as_price(range_start * 100)} - $#{as_price(range_end * 100 - 1)}"
            range_frequencies[category_name][range_key] = price_count
            global_range_frequencies[range_key] ||= []
            global_range_frequencies[range_key] << price_count
          end
        end
      end

      csv = EnterpriseReports.open_csv(file_location_hash)
      csv << ['Category Name'] + range_frequencies.first[1].keys
      range_frequencies.each do |name, frequencies|
        csv << [name] + frequencies.values
      end
      csv << %w[Averages] + global_range_frequencies.values.collect(&:mean)
      csv << []
      # Header row is repeated after the averages block (appears intentional,
      # e.g. for charting a second section).
      csv << ['Category Name'] + range_frequencies.first[1].keys

      s3_url = EnterpriseReports.move_to_s3(@client_name, csv, %w[big-data-reports price-distributions])
      csv.close
      @document[:file_details][report_name] = {s3_url: s3_url}
    else
      @document[:file_details][report_name] = {error: :unable_to_generate, message: 'No price distribution data'}
    end
  end
end
-
1
# For one book on one warehouse date, projects where its current Amazon sales
# rank would place it inside the top-100 lists of its related / extended /
# similar categories, and how much its rank must improve to enter each list.
class Top100ProjectedRankReport < Report
  # Requires extra[:warehouse_date_id] and at least one of :asin / :isbn13.
  # Raises ArgumentError listing whichever keys are missing.
  def initialize(extra = {})
    @client_name = :booklr
    extra = extra.with_indifferent_access
    missing_keys = []
    missing_keys << :warehouse_date_id if extra[:warehouse_date_id].blank?
    missing_keys << :asin << :isbn13 if extra[:asin].blank? && extra[:isbn13].blank?
    raise ArgumentError.new("Missing Keys: #{missing_keys.join(', ')}") unless missing_keys.blank?

    # Deterministic _id: rerunning for the same book/date reuses one document.
    extra.merge! _id: "top100-projected-rank-#{extra[:asin] || 'X'}-#{extra[:isbn13] || 'X'}-#{extra[:warehouse_date_id]}", asin: extra[:asin], isbn13: extra[:isbn13], warehouse_date_id: extra[:warehouse_date_id]

    super extra
  end

  # Just the identifying keys from the stored document.
  def params
    @document.select {|key, _| %w[asin isbn13 warehouse_date_id].include? key}
  end

  # Builds the CSV, uploads it to S3 and saves status/URL on the document.
  def generate
    warehouse_date = WarehouseDate.find @document[:warehouse_date_id]
    # NOTE(review): lookup is by ASIN only even though #initialize accepts
    # isbn13-only input — confirm callers always provide an ASIN.
    book_version = WarehouseBookVersion.find_by(asin: @document[:asin], tld: '.com')
    stat = WarehouseStat.find_by(warehouse_book_version_id: book_version.id, warehouse_date_id: warehouse_date.id)
    if stat.blank?
      @document[:error] = 'Book version is valid but has no data for today, let your big data helpers know so they can investigate. Error in Top100ProjectedRankReport, this should never happen.'
      @document[:status] = :error_while_processing
    elsif stat.amazon_sales_rank.blank?
      @document[:error] = 'Daily stat has no amazon sales rank, cannot generate report'
      @document[:status] = :error_while_processing
    else
      report_name = "related-top100-ranking-#{book_version.author_name.parameterize}-#{@document[:asin]}-#{warehouse_date.date.strftime('%m%d%y')}"
      report_hash = EnterpriseReports.generate_report_hash(report_name, @client_name)
      csv = EnterpriseReports.open_csv(report_hash)

      csv << ['Title', 'Author', 'Format', 'Asin', 'Sales Rank', 'Date']
      csv << [book_version.title, book_version.author_name, book_version.book_format, book_version.asin, stat.amazon_sales_rank, warehouse_date.date.to_s]

      column_headers = ['Category Name', 'Currently Ranked?', 'Your Theoretical Position', 'Best Sales Rank', 'Sales Rank Ahead of You', 'Sales Rank Behind You', 'Worst Sales Rank', '% Improvement In Sales Rank Needed to Enter Top 100']
      insert_projection_details csv, ['Related Categories'], column_headers, stat.get_top100_rank_projections(:related_categories), stat.amazon_sales_rank
      insert_projection_details csv, ['Extended Related Categories'], column_headers, stat.get_top100_rank_projections(:extended_related_categories), stat.amazon_sales_rank
      insert_projection_details csv, ['Similar Categories'], column_headers, stat.get_top100_rank_projections(:similar_categories), stat.amazon_sales_rank
      # Deliberately disabled section, kept for reference:
      # insert_projection_details csv, ['Categories From Similar Books'], column_headers, stat.get_top100_rank_projections(:similar_books), stat

      @document[:file_details][report_name] = {s3_url: EnterpriseReports.move_to_s3(@client_name, csv, ['big-data-reports', warehouse_date.date.strftime('%m%d%y')])}
      csv.close
      @document[:status] = :completed
    end

    save
  end

  private

  # Appends one category-group section: a blank spacer, the group title, the
  # column headers, then one row per category sorted by category name.
  # A :position of 101 is the sentinel for "not in the top 100".
  def insert_projection_details(csv, major_header, column_headers, projection_details, amazon_sales_rank)
    csv << []
    csv << major_header if major_header.present?
    csv << column_headers
    projection_details.sort_by {|_, position_data| position_data[:name]}.each do |_, position_data|
      csv << [position_data[:name],
              position_data[:currently_ranked] ? 'X' : nil,
              (position_data[:position] == 101 ? 'Unranked' : position_data[:position]),
              position_data[:best_sales_rank],
              position_data[:next_sales_rank],
              (position_data[:prev_sales_rank] == 0 ? nil : position_data[:prev_sales_rank]),
              position_data[:worst_sales_rank],
              # % the sales rank must improve to beat the 100th book's rank;
              # only meaningful when currently unranked.
              (position_data[:position] == 101 ? "#{((amazon_sales_rank - position_data[:next_sales_rank] - 1) / amazon_sales_rank.to_f * 100).round}%" : nil)]
    end
  end
end
-
1
# Join model between a User and a WarehouseBookVersion. Carries a serialized
# metadata snapshot used as a fallback when the warehouse record is missing a
# field; the get_* readers below fall through warehouse -> metadata -> 'No data'.
class TrackedBookVersion < ActiveRecord::Base
  # Attributes

  attr_accessible :warehouse_book_version, :warehouse_book_version_id, :user, :user_id, :metadata

  # Positional array snapshot — inferred from the readers below:
  # [asin, title, author_name, division_code]. TODO confirm against writers.
  serialize :metadata

  # Associations

  belongs_to :warehouse_book_version, inverse_of: :tracked_book_versions
  belongs_to :user, inverse_of: :tracked_book_versions

  # Validations

  validates_presence_of :warehouse_book_version_id, :user_id
  # A user may track a given book version only once.
  validates_uniqueness_of :warehouse_book_version_id, scope: :user_id

  # Miscellaneous

  # ASIN for display: live warehouse value, else metadata snapshot.
  def get_book_version_asin
    warehouse_book_version.asin || metadata.try(:[], 0) || 'No data'
  end

  # ISBN-13 straight from the warehouse record (may be nil).
  def get_book_version_isbn13
    warehouse_book_version.isbn13
  end

  def get_book_title
    warehouse_book_version.title || metadata.try(:[], 1) || 'No data'
  end

  def get_author_name
    warehouse_book_version.author_name || metadata.try(:[], 2) || 'No data'
  end

  # Division code exists only in the metadata snapshot.
  def get_division_code
    metadata.try(:[], 3) || 'No data'
  end

  def get_book_format
    warehouse_book_version.book_format || 'No data'
  end

  # First search-result URL whose embedded ASIN matches this record's ASIN,
  # or nil. Previously used select {...}.first, re-deriving the ASIN for
  # every URL; find short-circuits and the ASIN is computed once. Only valid
  # 10-character ASINs are matched ('No data' fallback can never match).
  def match_url_on_search_page_from_metadata_asin(urls)
    asin = get_book_version_asin
    return nil unless asin.length == 10
    urls.find {|url| ScraperUtilities.extract_asin_from_url(url) == asin}
  end
end
-
1
# Devise-backed account model. Each user tracks warehouse book versions and
# carries a bitmask-encoded set of roles.
class User < ActiveRecord::Base
  # Include default devise modules. Others available are:
  # :token_authenticatable, :encryptable, :confirmable, :lockable, :timeoutable and :omniauthable
  devise :database_authenticatable, :rememberable, :trackable, :validatable

  # Setup accessible (or protected) attributes for your model
  attr_accessible :email, :password, :password_confirmation, :remember_me, :name, :validate_tracked_book_versions, :validate_tracked_book_versions_on_itunes, :tld

  # Associations

  has_many :tracked_book_versions, inverse_of: :user, dependent: :destroy
  has_many :warehouse_book_versions, through: :tracked_book_versions

  # Roles packed into the roles_mask integer column (bitmask gem).
  # Only add roles to the end of this array
  # Check out Railscast #189
  bitmask :roles_mask, values: ['admin'], as: :roles

  # Validations

  validates_presence_of :tld, :name
  validates_inclusion_of :tld, in: Utilities::TLDS

  # True when the user holds the given role (accepts symbol or string).
  def has_role?(role)
    roles.include?(role.to_s)
  end

  # Convenience predicate for the :admin role.
  def admin?
    has_role?(:admin)
  end
end
-
1
class WarehouseBookVersion < ActiveRecord::Base
-
1
mount_uploader :book_version_image, BookVersionImageUploader
-
-
1
STATUSES = [:new, :validated, :validated_from_top_100s, :ready_for_amazon_ingestion, :ingested, :page_not_found, :throttled, :external_error, :invalid_key, :invalid_key_type, :invalid_on_amazon, :no_author, :no_format, :no_results, :ambiguous_results, :duplicate_asin, :duplicate_isbn13, :duplicate_bn_id, :misassigned_asin, :api_call_in_progress].freeze
-
-
1
attr_accessible :asin, :bn_id, :book_format, :isbn13, :pub_date, :status, :title, :publisher, :sold_by, :pages, :physical_details, :author_name, :author_asin, :itunes_id, :tld, :duplicate_key, :source, :itunes_pub_date, :itunes_genres, :canonical_amazon_url, :canonical_bn_url, :canonical_goodreads_url, :amazon_book_description
-
-
# Associations
-
1
# User tracking (join model TrackedBookVersion).
has_many :tracked_book_versions, inverse_of: :warehouse_book_version, dependent: :destroy
has_many :users, through: :tracked_book_versions
# Note: this association was previously declared twice; the redundant
# duplicate declaration has been removed.
has_many :book_version_exceptions, inverse_of: :warehouse_book_version, dependent: :destroy
# Daily product-page stats; list stats survive the book version's deletion.
has_many :warehouse_stats, inverse_of: :warehouse_book_version, dependent: :destroy
has_many :warehouse_list_stats, inverse_of: :warehouse_book_version
# Cached Amazon API payload and its item rows (ordered variants below).
has_one :amazon_api_response, inverse_of: :warehouse_book_version, dependent: :destroy
has_many :amazon_api_response_items, through: :amazon_api_response
has_many :amazon_api_lookup_response_items, lambda {order :response_rank}, through: :amazon_api_response
has_many :amazon_api_search_response_items, lambda {order :response_rank}, through: :amazon_api_response
has_many :book_version_categories, inverse_of: :warehouse_book_version
# Category stats where this book holds the best / worst rank.
has_many :best_rank_category_stats, inverse_of: :best_rank_book_version
has_many :worst_rank_category_stats, inverse_of: :worst_rank_book_version
-
-
# Validations
-
1
validates_uniqueness_of :asin, allow_nil: true, scope: :tld
-
1
validates_uniqueness_of :isbn13, allow_nil: true, scope: :tld
-
1
validates_uniqueness_of :bn_id, allow_nil: true, scope: :tld
-
1
validates_uniqueness_of :itunes_id, allow_nil: true, scope: :tld
-
1
validates_presence_of :status, :tld
-
1
validates_inclusion_of :tld, in: Utilities::TLDS
-
1
validates_inclusion_of :status, in: STATUSES
-
-
# Scopes
-
29
scope :ingested, lambda {where(status: 'ingested')}
-
1
scope :not_ingested, lambda {where{status != :ingested}}
-
33
scope :com, lambda {where(tld: '.com')}
-
1
scope :couk, lambda {where(tld: '.co.uk')}
-
-
6
scope :statable, lambda {ingested}
-
5
scope :amazon_statable, lambda {statable.where{asin != nil}}
-
3
scope :bn_statable, lambda {statable.where{(isbn13 != nil) | (bn_id != nil)}}
-
3
scope :itunes_statable, lambda {statable.where{itunes_id != nil}}
-
3
scope :goodreads_statable, lambda {statable.where{(asin != nil) | (canonical_goodreads_url != nil) | (isbn13 != nil) | (bn_id != nil)}}
-
-
# Callbacks
-
-
1
after_commit :validate_on_amazon, on: :create
-
1
after_commit :add_asin_to_mongo_asin_list, on: :create
-
-
1
# after_commit(:create) hook: queues background Amazon validation, but only
# for records still in the :new status.
def validate_on_amazon
  return unless status == :new
  BookVersionValidationWorkers::ValidateNewBookVersion.perform_async(id)
end
-
-
1
# after_commit(:create) hook: registers this version's ASIN in the Mongo
# all-ASIN list; no-op when the record has no ASIN.
def add_asin_to_mongo_asin_list
  return if asin.blank?
  MongoUtilities.add_documents_to_all_asin_list(MongoUtilities.all_asin_document(asin, tld))
end
-
-
# Miscellaneous
-
-
1
# Maps a scraping scope name to the worker class that gathers its stats.
#
# @param scope [Symbol, String] one of :amazon_statable, :bn_statable,
#   :itunes_statable or :goodreads_statable (matching the scopes above)
# @return [Class] the corresponding MongoWorkers worker
# @raise [ArgumentError] for any unrecognised scope
def self.scope_to_worker_class(scope)
  case scope.to_s
  when 'amazon_statable'
    MongoWorkers::GetAmazonProductPageStats
  when 'bn_statable'
    MongoWorkers::GetBarnesAndNobleStats
  when 'itunes_statable'
    MongoWorkers::GetItunesStats
  when 'goodreads_statable'
    MongoWorkers::GetGoodreadsStats
  else
    # Idiomatic raise: class + message, instead of ArgumentError.new(...)
    raise ArgumentError, 'Bad scraping scope'
  end
end
-
-
1
# True once this version has reached the :ingested status.
def ingested?
  status.eql?(:ingested)
end
-
-
1
# The persisted status column exposed as a Symbol (nil when unset).
def status
  raw_status = read_attribute(:status)
  raw_status.try(:to_sym)
end
-
-
1
# Preferred external identifier: the ISBN-13 when set, otherwise the ASIN.
def isbn_or_asin
  isbn13.nil? ? asin : isbn13
end
-
-
1
# EAN used for Barnes & Noble lookups: ISBN-13 when set, otherwise the B&N id.
def ean
  isbn13.nil? ? bn_id : isbn13
end
-
-
1
# Product page URL on the Amazon storefront matching this version's TLD.
def amazon_url
  Urls.amazon_book_page(asin, tld)
end
-
-
1
# Product page URL on barnesandnoble.com, keyed by this version's EAN.
def bn_url
  Urls.bn_book_page(ean)
end
-
-
1
# Absorbs another book version's associations into this one: adds any users we
# don't already have, then repoints its exceptions and list stats at our id.
def copy_associations_from(other_warehouse_book_version)
  missing_users = other_warehouse_book_version.users - users
  self.users << missing_users unless missing_users.empty?

  [BookVersionException, WarehouseListStat].each do |association_class|
    association_class.where(warehouse_book_version_id: other_warehouse_book_version.id).update_all(warehouse_book_version_id: id)
  end
end
-
-
1
# Resolves a uniqueness collision on +duplicate_key+ (:asin or :isbn13) between
# this record and +existing_warehouse_book_version+, which already holds the key.
#
# Returns true when THIS record was destroyed (the caller must stop using it);
# false when this record survives and the existing one was absorbed or this
# record was flagged as a duplicate.
def resolve_duplicate_key(existing_warehouse_book_version, duplicate_key)
  # The complementary identifier: clash on isbn13 -> compare asin, and vice versa.
  other_key = (duplicate_key == :isbn13) ? :asin : :isbn13

  if existing_warehouse_book_version.status != :ingested
    # Existing record is not ingested: this record wins. Absorb the other's
    # associations and delete it.
    copy_associations_from(existing_warehouse_book_version)
    existing_warehouse_book_version.destroy
    false
  elsif existing_warehouse_book_version.send(other_key).blank?
    # Existing record is ingested but is missing the other key: it wins. Move
    # our associations and our other key over, then destroy self.
    existing_warehouse_book_version.copy_associations_from(self)
    key_to_transfer = self.send(other_key)
    destroy
    existing_warehouse_book_version.update_attributes(:"#{other_key}" => key_to_transfer)
    true
  elsif existing_warehouse_book_version.send(other_key) != self.send(other_key)
    # if the corresponding key doesn't match on both then set this as a duplicate status and set its duplicate key so we can resolve it later
    self.duplicate_key = self.send(duplicate_key)
    self.status = :"duplicate_#{duplicate_key}"
    send "#{duplicate_key}=", nil
    false
    # NOTE(review): when both records agree on the other key too, no branch
    # runs and nil falls out — callers treat that like false. Confirm intended.
  end
end
-
-
1
# Refreshes the cached Amazon API response (and its items) for this version.
#
# options:
#   :items      - pre-fetched items hash keyed by :lookup/:search/:status;
#                 when absent the Amazon API is queried.
#   :key_method - name of the method supplying the lookup key and key type
#                 (defaults to isbn_or_asin / its inferred key type).
#
# Returns the items' :status Symbol. Also copies that status onto this record
# unless it is already :ingested (with special cases below).
def update_amazon_api_response(options = {})
  if options[:items].blank?
    key = options[:key_method].present? ? self.send(options[:key_method]) : isbn_or_asin
    key_type = options[:key_method] || Utilities.determine_key_type(isbn_or_asin)
    items_hash = AmazonApi.get_all_items_by_keys_and_tld(key, key_type, tld)
    items = items_hash[key]
  else
    items = options[:items]
  end

  # Reuse the existing response row (clearing its old items and bumping
  # updated_at) or create a fresh one.
  if amazon_api_response.blank?
    create_amazon_api_response
  else
    AmazonAPIResponseItem.where(amazon_api_response_id: amazon_api_response.id).delete_all
    amazon_api_response.touch
  end

  # Persist one AmazonAPIResponseItem per returned item, for both the lookup
  # and search queries, preserving the response order via response_rank.
  [:lookup, :search].each do |query_type|
    num = 1
    items[query_type].each do |item|
      # Build the attribute hash by delegating each field to
      # AmazonApi.get_<attribute>_from_item.
      attributes = [:asin, :author, :binding, :brand, :creator, :ean, :eisbn,
        :isbn, :item_dimensions_height, :item_dimensions_height_unit, :item_dimensions_length, :item_dimensions_length_unit,
        :item_dimensions_weight, :item_dimensions_weight_unit, :item_dimensions_width, :item_dimensions_width_unit,
        :label, :large_image_url, :list_price_amount, :list_price_currency_code, :manufacturer, :medium_image_url,
        :number_of_pages, :package_dimensions_height, :package_dimensions_height_unit, :package_dimensions_length,
        :package_dimensions_length_unit, :package_dimensions_weight, :package_dimensions_weight_unit, :package_dimensions_width,
        :package_dimensions_width_unit, :publication_date, :publisher, :sales_rank, :small_image_url, :studio,
        :title].each_with_object({}) do |attribute, hash|
        hash[attribute] = AmazonApi.send "get_#{attribute}_from_item", item
      end
      amazon_api_response.amazon_api_response_items.create attributes.merge({response_rank: num, query_type: query_type})
      num += 1
    end
  end

  # Status bookkeeping: a throttled fetch downgrades to :validated rather than
  # recording :throttled, and external errors leave our status untouched.
  if status != :ingested
    if items[:status] == :throttled
      update_attributes status: :validated
    elsif items[:status] != :external_error
      update_attributes status: items[:status]
    end
  end

  items[:status]
end
-
-
1
# Promotes a :ready_for_amazon_ingestion record: pulls attributes from the
# stored Amazon API response, resolves ASIN/ISBN13 collisions with existing
# rows, persists the resulting status and queues follow-up workers.
# May destroy self via resolve_duplicate_key; no-op for any other status.
def ingest
  if status == :ready_for_amazon_ingestion
    self.status = populate_attributes_from_amazon_api
    if status == :ingested
      # Squeel block: another row with the same asin + tld but a different id?
      if asin.present? && WarehouseBookVersion.where {(asin == my{asin}) & (tld == my{tld}) & (id != my{id})}.exists?
        Rails.logger.tagged('book_data') {Rails.logger.info "ISBN13: #{isbn13} resolved to duplicate ASIN: #{asin}"}
        existing_warehouse_book_version = WarehouseBookVersion.find_by(asin: asin, tld: tld)
        return if resolve_duplicate_key(existing_warehouse_book_version, :asin) # destroys this book version when resolve_duplicate_key returns true
      elsif isbn13.present? && WarehouseBookVersion.where {(isbn13 == my{isbn13}) & (tld == my{tld}) & (id != my{id})}.exists?
        Rails.logger.tagged('book_data') {Rails.logger.info "ASIN: #{asin} resolved to duplicate ISBN13: #{isbn13}"}
        existing_warehouse_book_version = WarehouseBookVersion.find_by(isbn13: isbn13, tld: tld)
        return if resolve_duplicate_key(existing_warehouse_book_version, :isbn13) # destroys this book version when resolve_duplicate_key returns true
      end
      # Lookup iTunes id from API for this book if its a Kindle Edition
      Rails.logger.tagged('book_data') {Rails.logger.info "WarehouseBookVersion #{id} with isbn_or_asin #{isbn_or_asin} successfully ingested"}
    else
      # Resolve duplicate keys for invalid books: blank the clashing key but
      # remember it in duplicate_key for later reconciliation.
      if asin.present? && WarehouseBookVersion.where {(asin == my{asin}) & (tld == my{tld}) & (id != my{id})}.exists?
        self.duplicate_key = self.asin
        self.asin = nil
      elsif isbn13.present? && WarehouseBookVersion.where {(isbn13 == my{isbn13}) & (tld == my{tld}) & (id != my{id})}.exists?
        self.duplicate_key = self.isbn13
        self.isbn13 = nil
      end
    end
    # Saving the status on the book_version
    save!

    # Ensure the book was ingested correctly before queuing extra data
    if status == :ingested
      BookVersionWorkers::GetItunesMetadata.perform_async id if isbn13.present? && book_format.include?("Kindle")
      BookVersionWorkers::PopulateWebData.perform_async id, asin, tld, author_name
      BookVersionWorkers::PopulateCanonicalUrls.perform_async id
    end
  end
end
-
-
1
# Copies identifying attributes (asin, isbn13, format, author, title, ...) from
# the matching item of the stored Amazon API response onto self. Does NOT save.
#
# @param validate_asin [Boolean] when true, a mismatch between our asin and the
#   response item's asin aborts with :misassigned_asin instead of overwriting.
# @return [Symbol] :ingested on success, or one of the failure statuses
#   :misassigned_asin, :invalid_key, :no_format, :no_author — or whatever
#   error Symbol matching_response_item itself produced.
# @raise [RuntimeError] when no matching response item appears after ~11 polls.
def populate_attributes_from_amazon_api(validate_asin = true)
  response_item = nil
  count = 0

  # Poll for a matching response item — presumably written by a background
  # worker — sleeping 3s between attempts. Blocks the caller for up to ~33s
  # before giving up.
  loop do
    response_item = amazon_api_response.matching_response_item
    break if response_item.present?

    count = count+1
    sleep(3)
    raise 'No Matching Response Items Found' if count > 10
  end

  # matching_response_item may yield an error Symbol instead of an item record;
  # propagate it directly as this method's status.
  return response_item if response_item.class == Symbol

  # set keys
  if self.asin.blank?
    self.asin = response_item.asin
  elsif validate_asin && asin != response_item.asin
    return :misassigned_asin
  elsif !validate_asin && asin != response_item.asin
    self.asin = response_item.asin
  end

  self.isbn13 = response_item.isbn13 if isbn13.blank?

  # Reject malformed ISBN-13s; if another row already claims the same value,
  # stash it in duplicate_key and blank it before bailing out.
  if self.isbn13.present? && !ISBN_Tools.is_valid_isbn13?(self.isbn13)
    if WarehouseBookVersion.where {(isbn13 == my{isbn13}) & (tld == my{tld}) & (id != my{id})}.exists?
      self.duplicate_key = self.isbn13
      self.isbn13 = nil
    end
    return :invalid_key
  end

  # Assign book type (hardcover, paperback, etc) and title
  self.book_format = response_item.get_binding
  if book_format.blank?
    Rails.logger.tagged('book_data') {Rails.logger.info "no format, populate_attributes_from_amazon failure for book version: #{isbn_or_asin}"}
    return :no_format
  end
  # return first author, if author doesn't exist get creator (editor, etc)
  # (truncated to 255 chars to fit the column)
  author_name = response_item.author_name.try(:first, 255)
  if author_name.blank?
    Rails.logger.tagged('book_data') {Rails.logger.info "no author, populate_attributes_from_amazon failure for book version: #{isbn_or_asin}"}
    return :no_author
  end

  self.pages = response_item.number_of_pages
  self.publisher = response_item.publisher
  self.physical_details = response_item.physical_details
  # Cover image is fetched asynchronously.
  BookVersionWorkers::DownloadImage.perform_async id, response_item.medium_image_url if response_item.medium_image_url.present?
  self.pub_date = response_item.publication_date

  self.author_name = author_name
  self.title = response_item.title

  :ingested
end
-
-
1
# IDs of other book versions that share at least 3 category names with this
# one, falling back to "at least 2" when nothing clears the higher bar.
def similar_warehouse_book_version_ids
  # Count, per other book version, how many of our category names it shares.
  frequencies = BookVersionCategory.where(category_name: book_version_categories.collect(&:category_name)).where.not(warehouse_book_version_id: id).each_with_object(Hash.new(0)) do |element, hash|
    hash[element.warehouse_book_version_id] += 1
  end
  frequencies.select {|_, count| count > 2}.keys.uniq.presence || frequencies.select {|_, count| count > 1}.keys.uniq
end
-
-
1
# Up to 25 category names within this version's store namespace ('Kindle...'
# vs 'Book...') that commonly appear on similar books but not on this one,
# ordered by how often they occur.
def similar_categories
  category_prefix = book_format.include?('Kindle') ? 'Kindle%' : 'Book%'
  # Count, per other book version, how many of our namespaced category names it shares.
  book_version_frequencies = BookVersionCategory.where('category_name LIKE ?', category_prefix).where(category_name: book_version_categories.collect(&:category_name)).where.not(warehouse_book_version_id: id).each_with_object(Hash.new(0)) do |element, hash|
    hash[element.warehouse_book_version_id] += 1
  end

  # Take up to 50 book version ids, most-overlapping first (min overlap of 2).
  ids = (2..(book_version_frequencies.values.max || 0)).to_a.reverse.collect do |num|
    book_version_frequencies.select {|_, count| count == num}.keys.uniq.presence
  end.flatten.first(50)

  # Tally those versions' other category names and keep the 25 most frequent.
  BookVersionCategory.where(warehouse_book_version_id: ids).where('category_name LIKE ?', category_prefix).where.not(category_name: book_version_categories.collect(&:category_name)).each_with_object(Hash.new(0)) do |element, hash|
    hash[element.category_name] += 1
  end.sort {|x, y| y[1] <=> x[1]}.first(25).collect {|x, _| x}
end
-
end
-
1
# A node in the warehouse category taxonomy (Amazon, Barnes & Noble or Apple).
# Amazon categories carry a status: one 'canonical' row per logical category,
# with 'alternative'/'deleted' rows pointing back via canonical_category_id.
class WarehouseCategory < ActiveRecord::Base
  # Constants

  STATUSES = %w[canonical alternative deleted].freeze

  # Attributes

  attr_accessible :category_id, :depth, :name, :category_type, :tx_book_category_id, :status, :tld, :parent_id

  # Associations

  has_many :warehouse_stats, inverse_of: :warehouse_category
  has_many :warehouse_list_stats, inverse_of: :warehouse_category
  belongs_to :parent, class_name: 'WarehouseCategory', inverse_of: :children
  has_many :children, class_name: 'WarehouseCategory', foreign_key: 'parent_id', inverse_of: :parent
  belongs_to :canonical_category, class_name: 'WarehouseCategory', inverse_of: :alternative_categories
  # Currently grabs alternative and deleted categories, despite association name. Couldn't think of a good name to encompass both
  has_many :alternative_categories, class_name: 'WarehouseCategory', foreign_key: 'canonical_category_id', inverse_of: :canonical_category
  has_many :book_version_categories, inverse_of: :warehouse_category
  has_many :warehouse_book_versions, through: :book_version_categories
  has_many :category_stats, inverse_of: :warehouse_category

  # Validations

  validates_presence_of :name, :category_type
  validates_inclusion_of :tld, in: Utilities::TLDS

  # Status bookkeeping only applies to Amazon categories.
  validates_presence_of :status, :tld, if: :amazon_category?
  validates_inclusion_of :status, in: STATUSES + STATUSES.collect(&:to_sym), if: :amazon_category?
  validate :uniqueness_of_canonical_category_id_and_name, if: :amazon_category?
  validate :uniqueness_of_name_per_category_id_and_tld, if: :amazon_category?

  # Scopes

  scope :canonical, lambda {where(status: 'canonical')}
  scope :alternative, lambda {where(status: 'alternative')}
  scope :active, lambda {where{(status == 'canonical') | (status == 'alternative')}}
  scope :deleted, lambda {where(status: 'deleted')}
  scope :com, lambda {where(tld: '.com')}
  scope :couk, lambda {where(tld: '.co.uk')}
  scope :amazon, lambda {where(category_type: 'AmazonBookCategory')}
  scope :barnes_and_noble, lambda {where(category_type: 'BarnesNobleBookCategory')}
  scope :apple, lambda {where(category_type: 'AppleBookCategory')}

  # Miscellaneous

  # Custom validation: a canonical Amazon category's category_id and name must
  # each be unique among canonical Amazon categories within the same tld.
  def uniqueness_of_canonical_category_id_and_name
    unless status.blank?
      if category_id.present?
        %w[category_id name].each do |attribute|
          if status.to_sym == :canonical && WarehouseCategory.amazon.canonical.where(:"#{attribute}" => send(attribute), tld: tld).where{id != my{id}}.exists?
            errors.add :"#{attribute}", 'has already been taken'
          end
        end
      end
    end
  end

  # Custom validation: name must be unique per (category_id, tld) among Amazon categories.
  def uniqueness_of_name_per_category_id_and_tld
    errors.add :name, 'has already been taken for this category id and tld' if WarehouseCategory.amazon.where(name: name, category_id: category_id, tld: tld).where{id != my{id}}.exists?
  end

  # Adds in canonical?, alternative?, and deleted? predicates derived from STATUSES.
  def method_missing(method_name, *args, &block)
    method_name =~ /^(#{STATUSES.join('|')})\?$/ ? status == Regexp.last_match(1) : super
  end

  # Keep respond_to?/method() honest about the dynamic predicates that
  # method_missing serves (Ruby convention: always pair the two).
  def respond_to_missing?(method_name, include_private = false)
    method_name.to_s =~ /^(#{STATUSES.join('|')})\?$/ ? true : super
  end

  def canonical?
    status == 'canonical'
  end

  def amazon_category?
    category_type == 'AmazonBookCategory'
  end

  # B&N physical-book categories carry category_ids starting with '1'.
  def bn_physical_category?
    category_type == 'BarnesNobleBookCategory' && category_id.starts_with?('1')
  end

  # Last segment of the '>'-delimited category path.
  def leaf_name
    name.split('>').last.strip
  end

  def set_parent_id
    self.parent_id = WarehouseCategory.get_parent_id_for_category_name name, tld
  end

  # Id of the parent category (everything before the last ' > ' segment),
  # preferring a canonical row, then an alternative one, then anything.
  # Returns nil for root names (no '>') or when no parent row exists.
  def self.get_parent_id_for_category_name(category_name, tld)
    if category_name.count('>') > 0
      parents = where name: category_name.split(/ > /)[0..-2].join(' > '), tld: tld
      parent = parents.select {|category| category.canonical?}.first || parents.select {|category| category.status == 'alternative'}.first || parents.first
      parent.id if parent.present?
    end
  end

  def related_categories
    WarehouseCategory.related_categories self
  end

  # Canonical categories that are children of the given categories (or of their
  # parents), plus the canonical counterparts of any alternative children.
  def self.related_categories(categories)
    categories_array = Array.wrap categories
    all_category_ids = (categories_array.collect(&:id) + categories_array.collect(&:parent_id).compact).uniq
    canonical_category_id_by_alternative_parent_id = WarehouseCategory.alternative.com.where(parent_id: all_category_ids).value_of(:parent_id, :canonical_category_id).each_with_object({}) do |values, hash|
      hash[values[0]] ||= []
      hash[values[0]] << values[1]
    end
    canonical_category_ids = all_category_ids.collect {|category_id| canonical_category_id_by_alternative_parent_id[category_id]}.flatten.compact.uniq

    (WarehouseCategory.canonical.where(parent_id: all_category_ids) + WarehouseCategory.find(canonical_category_ids)).uniq
  end

  # Average of +column+ across this category's top-100 list stats on the given date.
  def average_daily_top100(column, warehouse_date)
    WarehouseListStat.where(warehouse_category_id: id, warehouse_date_id: warehouse_date.id).average(column)
  end

  # Average of +column+ across all stats of this category's book versions on the given date.
  def average_daily(column, warehouse_date)
    WarehouseStat.where(warehouse_book_version_id: warehouse_book_version_ids, warehouse_date_id: warehouse_date.id).average(column)
  end
end
-
1
# A calendar date that warehouse stats are keyed against.
class WarehouseDate < ActiveRecord::Base
  attr_accessible :date

  # Associations
  has_many :warehouse_stats, inverse_of: :warehouse_date
  has_many :warehouse_list_stats, inverse_of: :warehouse_date
  has_many :book_version_exceptions, inverse_of: :warehouse_date
  has_many :warehouse_categories, inverse_of: :warehouse_date

  # Validations
  validates_presence_of :date

  # The row for today's date, or nil when none exists yet.
  def self.current
    where(date: Date.current).take
  end
end
-
1
# One entry of a per-category top-100 bestseller list snapshot for a date.
class WarehouseListStat < ActiveRecord::Base
  attr_accessible :warehouse_book_version, :warehouse_book_version_id, :warehouse_category, :warehouse_category_id, :warehouse_date, :warehouse_date_id, :warehouse_trend, :warehouse_trend_id, :days_in_top_100, :name, :rank, :price, :author, :title, :asin, :isbn, :bn_id, :itunes_id, :list_type

  # Associations
  belongs_to :warehouse_category, inverse_of: :warehouse_list_stats
  # warehouse_book_version may be absent — list entries exist even for books
  # the warehouse does not track (see blank? checks in WarehouseStat).
  belongs_to :warehouse_book_version, inverse_of: :warehouse_list_stats
  belongs_to :warehouse_trend, inverse_of: :warehouse_list_stats
  belongs_to :warehouse_date, inverse_of: :warehouse_list_stats

  # Validations
  validates_presence_of :warehouse_category_id, :warehouse_date_id, :rank
end
-
1
# A storefront region, identified by its Amazon TLD ('.com', '.co.uk', ...).
class WarehouseRegion < ActiveRecord::Base
  # Associations
  has_many :warehouse_stats, inverse_of: :warehouse_region
  has_many :book_version_categories, inverse_of: :warehouse_region
  has_many :warehouse_categories, inverse_of: :warehouse_region

  attr_accessible :tld

  # Validations
  validates_presence_of :tld

  # Miscellaneous

  # The US storefront region.
  def self.com
    find_by(tld: '.com')
  end

  # The UK storefront region.
  def self.couk
    find_by(tld: '.co.uk')
  end
end
-
1
class WarehouseStat < ActiveRecord::Base
-
1
belongs_to :warehouse_book_version, inverse_of: :warehouse_stats
-
1
belongs_to :warehouse_date, inverse_of: :warehouse_stats
-
1
belongs_to :warehouse_region, inverse_of: :warehouse_stats
-
-
# main sales rank + 3 sub category rankings
-
1
belongs_to :warehouse_amazon_sales_rank_category, :class_name => 'WarehouseCategory', :foreign_key => 'warehouse_amazon_sales_rank_category_id', inverse_of: :warehouse_stats
-
1
belongs_to :warehouse_amazon_category1, :class_name => 'WarehouseCategory', :foreign_key => 'warehouse_amazon_category1_id', inverse_of: :warehouse_stats
-
1
belongs_to :warehouse_amazon_category2, :class_name => 'WarehouseCategory', :foreign_key => 'warehouse_amazon_category2_id', inverse_of: :warehouse_stats
-
1
belongs_to :warehouse_amazon_category3, :class_name => 'WarehouseCategory', :foreign_key => 'warehouse_amazon_category3_id', inverse_of: :warehouse_stats
-
-
1
belongs_to :sub_category1_author_rank_id, :class_name => 'WarehouseCategory', :foreign_key => 'sub_category1_author_rank_id'
-
1
belongs_to :sub_category2_author_rank_id, :class_name => 'WarehouseCategory', :foreign_key => 'sub_category2_author_rank_id'
-
1
belongs_to :sub_category3_author_rank_id, :class_name => 'WarehouseCategory', :foreign_key => 'sub_category3_author_rank_id'
-
1
belongs_to :sub_category4_author_rank_id, :class_name => 'WarehouseCategory', :foreign_key => 'sub_category4_author_rank_id'
-
-
1
attr_accessible :warehouse_book_version, :warehouse_date, :warehouse_region,
-
:warehouse_region_id, :warehouse_amazon_category1_id, :warehouse_amazon_category2_id,
-
:warehouse_amazon_category3_id, :amazon_aus_price, :amazon_average_rating, :amazon_digital_list_price,
-
:amazon_euro_price, :amazon_likes, :amazon_list_price, :amazon_price, :amazon_review_count, :amazon_sales_rank,
-
:warehouse_amazon_sales_rank_category_id, :bn_average_rating, :bn_list_price, :bn_nook_list_price, :bn_nook_price,
-
:bn_price, :bn_review_count, :bn_sales_rank, :amazon_category1_rank, :amazon_category2_rank, :amazon_category3_rank,
-
:amazon_days_in_top_100_1, :amazon_days_in_top_100_2, :amazon_days_in_top_100_3, :amazon_top_100_trend1,
-
:amazon_top_100_trend2, :amazon_top_100_trend3, :delicious_count, :digg_count, :fb_click_count, :fb_comment_count,
-
:fb_commentsbox_count, :fb_like_count, :fb_share_count, :google_plus_count, :itunes_average_rating,
-
:itunes_price, :itunes_rating_count, :linkedin_count, :pinterest_count, :reddit_count, :stumbleupon_count,
-
:twitter_count, :five_star_count, :four_star_count, :three_star_count, :two_star_count, :one_star_count, :tx_book_version_stat_id,
-
:overall_author_rank, :sub_category1_author_rank, :sub_category1_author_rank_id, :sub_category2_author_rank,
-
:sub_category2_author_rank_id, :sub_category3_author_rank, :sub_category3_author_rank_id, :sub_category4_author_rank,
-
:sub_category4_author_rank_id, :amazon_availability, :kindle_unlimited
-
-
# Delegations
-
1
delegate :date, to: :warehouse_date
-
1
delegate :tld, :asin, :isbn13, to: :warehouse_book_version
-
-
# Validations
-
1
validates_presence_of :warehouse_book_version
-
1
validates_presence_of :warehouse_date
-
1
validates_presence_of :warehouse_region
-
-
# Constants
-
1
WAREHOUSE_STAT_FIELDS = %w[warehouse_amazon_category1_id amazon_category1_rank warehouse_amazon_category2_id amazon_category2_rank warehouse_amazon_category3_id amazon_category3_rank tx_book_version_stat_id warehouse_date_id warehouse_region_id warehouse_book_version_id warehouse_amazon_sales_rank_category_id amazon_sales_rank bn_sales_rank amazon_likes amazon_list_price amazon_price amazon_digital_list_price bn_nook_price bn_nook_list_price amazon_euro_price amazon_aus_price bn_price bn_list_price itunes_price itunes_average_rating itunes_rating_count stumbleupon_count reddit_count fb_commentsbox_count fb_click_count fb_comment_count fb_like_count fb_share_count delicious_count google_plus_count twitter_count digg_count pinterest_count linkedin_count amazon_average_rating amazon_review_count bn_average_rating bn_review_count five_star_count four_star_count three_star_count two_star_count one_star_count overall_author_rank sub_category1_author_rank sub_category1_author_rank_id sub_category2_author_rank sub_category2_author_rank_id sub_category3_author_rank sub_category3_author_rank_id sub_category4_author_rank sub_category4_author_rank_id itunes_gb_average_rating itunes_gb_rating_count itunes_gb_price itunes_au_average_rating itunes_au_rating_count itunes_au_price amazon_availability kindle_unlimited].freeze
-
1
WAREHOUSE_AMAZON_ALSO_BOUGHT_FIELDS = %w[amazon_also_bought_title_1 amazon_also_bought_asin_1 amazon_also_bought_author_1 amazon_also_bought_price_1 amazon_also_bought_rating_1 amazon_also_bought_star_count_1 amazon_also_bought_title_2 amazon_also_bought_asin_2 amazon_also_bought_author_2 amazon_also_bought_price_2 amazon_also_bought_rating_2 amazon_also_bought_star_count_2 amazon_also_bought_title_3 amazon_also_bought_asin_3 amazon_also_bought_author_3 amazon_also_bought_price_3 amazon_also_bought_rating_3 amazon_also_bought_star_count_3 amazon_also_bought_title_4 amazon_also_bought_asin_4 amazon_also_bought_author_4 amazon_also_bought_price_4 amazon_also_bought_rating_4 amazon_also_bought_star_count_4 amazon_also_bought_title_5 amazon_also_bought_asin_5 amazon_also_bought_author_5 amazon_also_bought_price_5 amazon_also_bought_rating_5 amazon_also_bought_star_count_5 amazon_also_bought_title_6 amazon_also_bought_asin_6 amazon_also_bought_author_6 amazon_also_bought_price_6 amazon_also_bought_rating_6 amazon_also_bought_star_count_6].freeze
-
1
WAREHOUSE_AMAZON_BOUGHT_AFTER_VIEWING_FIELDS = %w[amazon_bought_after_viewing_title_1 amazon_bought_after_viewing_asin_1 amazon_bought_after_viewing_author_1 amazon_bought_after_viewing_rating_1 amazon_bought_after_viewing_star_count_1 amazon_bought_after_viewing_price_1 amazon_bought_after_viewing_title_2 amazon_bought_after_viewing_asin_2 amazon_bought_after_viewing_author_2 amazon_bought_after_viewing_rating_2 amazon_bought_after_viewing_star_count_2 amazon_bought_after_viewing_price_2 amazon_bought_after_viewing_title_3 amazon_bought_after_viewing_asin_3 amazon_bought_after_viewing_author_3 amazon_bought_after_viewing_rating_3 amazon_bought_after_viewing_star_count_3 amazon_bought_after_viewing_price_3 amazon_bought_after_viewing_title_4 amazon_bought_after_viewing_asin_4 amazon_bought_after_viewing_author_4 amazon_bought_after_viewing_rating_4 amazon_bought_after_viewing_star_count_4 amazon_bought_after_viewing_price_4].freeze
-
1
WAREHOUSE_AMAZON_FREQUENTLY_BOUGHT_TOGETHER_FIELDS = %w[amazon_frequently_bought_together_format_1 amazon_frequently_bought_together_price_1 amazon_frequently_bought_together_title_1 amazon_frequently_bought_together_format_2 amazon_frequently_bought_together_price_2 amazon_frequently_bought_together_title_2].freeze
-
16
WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS = Array.new(15) {|num| "amazon_similar_item_category_tree_#{num + 1}"}.freeze
-
16
WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_EXTERNAL_ID_FIELDS = Array.new(15) {|num| "amazon_similar_item_category_external_id_#{num + 1}"}.freeze
-
1
WAREHOUSE_AMAZON_RELATED_FORMAT_FIELDS = %w[related_formats_kindle_price related_formats_kindle_asin related_formats_mass_market_paperback_price related_formats_mass_market_paperback_asin related_formats_nook_price related_formats_nook_ean related_formats_hardcover_price related_formats_hardcover_asin related_formats_paperback_price related_formats_paperback_asin].freeze
-
17
WAREHOUSE_AMAZON_ALSO_BOUGHT_ITEMS_BY_FIELDS = Array.new(16) {|num| "amazon_also_bought_items_by_#{num + 1}"}.freeze
-
1
WAREHOUSE_AMAZON_AUTHOR_RANK_FIELDS = %w[overall_author_rank sub_category1_author_rank sub_category2_author_rank sub_category3_author_rank sub_category4_author_rank sub_category1_author_rank_id sub_category2_author_rank_id sub_category3_author_rank_id sub_category4_author_rank_id].freeze
-
1
WAREHOUSE_GOODREADS_FIELDS = %w[goodreads_work_average_rating goodreads_work_rating_count goodreads_work_review_count goodreads_work_added_by_count goodreads_work_to_read_count goodreads_edition_average_rating goodreads_edition_rating_count goodreads_edition_review_count goodreads_edition_added_by_count goodreads_5_star_count goodreads_4_star_count goodreads_3_star_count goodreads_2_star_count goodreads_1_star_count].freeze
-
1
WAREHOUSE_BN_ALSO_BOUGHT_FIELDS = %w[bn_also_bought_title_1 bn_also_bought_ean_1 bn_also_bought_author_1 bn_also_bought_price_1 bn_also_bought_title_2 bn_also_bought_ean_2 bn_also_bought_author_2 bn_also_bought_price_2 bn_also_bought_title_3 bn_also_bought_ean_3 bn_also_bought_author_3 bn_also_bought_price_3 bn_also_bought_title_4 bn_also_bought_ean_4 bn_also_bought_author_4 bn_also_bought_price_4 bn_also_bought_title_5 bn_also_bought_ean_5 bn_also_bought_author_5 bn_also_bought_price_5 bn_also_bought_title_6 bn_also_bought_ean_6 bn_also_bought_author_6 bn_also_bought_price_6].freeze
-
-
# Miscellaneous
-
-
1
# Non-nil values of the 15 amazon_similar_item_category_tree_X columns.
def amazon_similar_item_category_names
  WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS.map { |field| send(field) }.compact
end
-
-
1
# Non-nil values of the 15 amazon_similar_item_category_external_id_X columns.
def amazon_similar_item_category_external_ids
  WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_EXTERNAL_ID_FIELDS.map { |field| send(field) }.compact
end
-
-
# All .com warehouse categories named by this stat's similar-item columns,
# together with the canonical counterpart of every matched category that has one.
def all_similar_item_categories
  matched_categories = WarehouseCategory.com.where(name: amazon_similar_item_category_names).to_a
  canonical_counterparts = matched_categories.collect(&:canonical_category).compact

  matched_categories + canonical_counterparts
end
-
-
# Categories related to those in the amazon_similar_item_category_tree_X
# fields, restricted to the book's store ('Kindle Store...' vs everything else).
def related_categories
  kindle_categories, physical_categories = all_similar_item_categories.partition do |category|
    category.name.start_with?('Kindle Store')
  end
  relevant = warehouse_book_version.book_format.include?('Kindle') ? kindle_categories : physical_categories

  WarehouseCategory.related_categories relevant
end
-
-
1
# Related categories of the books recorded in this stat's also-bought and
# bought-after-viewing ASIN slots, for the same region and date.
def extended_related_categories
  also_bought_asins = (1..6).collect {|num| send "amazon_also_bought_asin_#{num}"}.compact
  bought_after_viewing_asins = (1..4).collect {|num| send "amazon_bought_after_viewing_asin_#{num}"}.compact
  book_version_ids = WarehouseBookVersion.where(asin: (also_bought_asins + bought_after_viewing_asins).uniq, tld: warehouse_region.tld).value_of(:id)
  stats = WarehouseStat.where(warehouse_book_version_id: book_version_ids, warehouse_date_id: warehouse_date.id)
  stats.collect(&:related_categories).flatten.uniq
end
-
-
1
# Projects where this book would land in each candidate category's top-100
# list for this stat's date.
#
# @param strategy [Symbol] selects the candidate categories:
#   :related_categories, :extended_related_categories, :similar_books or
#   :similar_categories.
# @return [Hash] keyed by warehouse_category_id; each value holds
#   :currently_ranked, :position (101 = off-list), :next_sales_rank,
#   :prev_sales_rank, :name, plus :best_sales_rank/:worst_sales_rank when
#   any comparable entry was found.
def get_top100_rank_projections(strategy)
  categories = case strategy
  when :related_categories
    related_categories
  when :extended_related_categories
    extended_related_categories
  when :similar_books
    # Categories of books that share many categories with ours (canonical rows
    # only, resolving alternatives to their canonical counterparts).
    category_names = BookVersionCategory.where(warehouse_book_version_id: warehouse_book_version.similar_warehouse_book_version_ids).value_of :category_name
    (WarehouseCategory.amazon.com.canonical.where(name: category_names) + WarehouseCategory.amazon.com.alternative.where(name: category_names).collect(&:canonical_category)).uniq.compact
  when :similar_categories
    (WarehouseCategory.amazon.com.canonical.where(name: warehouse_book_version.similar_categories) + WarehouseCategory.amazon.com.alternative.where(name: warehouse_book_version.similar_categories).collect(&:canonical_category)).uniq.compact
  end
  # Seed every candidate with off-list defaults (position 101 = not in top 100).
  details = categories.each_with_object({}) do |related_category, hash|
    hash[related_category.id] = {currently_ranked: false, position: 101, next_sales_rank: 0, prev_sales_rank: 0, name: related_category.name}
  end
  list_stats = WarehouseListStat.where(warehouse_category_id: details.keys, warehouse_date_id: warehouse_date_id)
  # Index the same-day overall stats by book version for sales-rank comparisons.
  stats_by_book_version_id = WarehouseStat.where(warehouse_book_version_id: list_stats.collect(&:warehouse_book_version_id).compact, warehouse_date_id: warehouse_date_id).each_with_object({}.with_indifferent_access) do |warehouse_stat, hash|
    hash[warehouse_stat.warehouse_book_version_id] = warehouse_stat
  end
  list_stats.each do |list_stat|
    details[list_stat.warehouse_category_id][:currently_ranked] = true if list_stat.asin == warehouse_book_version.asin
    # Can't compare without both the list entry's book and our own sales rank.
    next if list_stat.warehouse_book_version_id.blank? || amazon_sales_rank.blank?
    if stats_by_book_version_id[list_stat.warehouse_book_version_id].present? && stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank.present?
      # We outrank this entry: our projected position is the best such slot,
      # remembering the sales rank we'd displace.
      if amazon_sales_rank <= stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank && details[list_stat.warehouse_category_id][:position] > list_stat.rank
        details[list_stat.warehouse_category_id][:position] = list_stat.rank
        details[list_stat.warehouse_category_id][:prev_sales_rank] = stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank
      # We rank below this entry: track the largest sales rank still above us.
      elsif amazon_sales_rank > stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank && details[list_stat.warehouse_category_id][:next_sales_rank] < stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank
        details[list_stat.warehouse_category_id][:next_sales_rank] = stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank
      end

      # Track the extreme sales ranks seen in this category's list.
      if details[list_stat.warehouse_category_id][:best_sales_rank].blank? || stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank < details[list_stat.warehouse_category_id][:best_sales_rank]
        details[list_stat.warehouse_category_id][:best_sales_rank] = stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank
      end

      if details[list_stat.warehouse_category_id][:worst_sales_rank].blank? || stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank > details[list_stat.warehouse_category_id][:worst_sales_rank]
        details[list_stat.warehouse_category_id][:worst_sales_rank] = stats_by_book_version_id[list_stat.warehouse_book_version_id].amazon_sales_rank
      end
    end
  end

  details
end
-
-
1
# Builds, for each stat in +stats+, a projection of where its book could land on
# the Kindle Store top-100 lists for +warehouse_date+.  Returns a hash keyed by
# warehouse_book_version_id, then by warehouse_category_id, each entry holding:
#   currently_ranked - true when this stat's book already appears on that list
#   position         - best list position this book's sales rank beats (101 = off-list)
#   prev_sales_rank  - sales rank of the entry currently holding that position
#   next_sales_rank  - closest sales rank still ahead of this book
#   best_/worst_sales_rank - sales-rank extremes observed on the list
# NOTE(review): the +extended+ parameter is never used in this method body —
# confirm whether callers still need to pass it.
def self.top100_rank_projections(warehouse_date, stats, extended)
  # alternative-category parent_id => [canonical_category_id, ...]
  # (value_of is from the valium gem: plucks columns without instantiating models)
  canonical_category_id_by_alternative_parent_id = WarehouseCategory.alternative.com.value_of(:parent_id, :canonical_category_id).each_with_object({}) do |values, hash|
    hash[values[0]] ||= []
    hash[values[0]] << values[1]
  end
  # Only Kindle Store categories referenced by any of the given stats.
  all_similar_item_category_names = stats.collect(&:amazon_similar_item_category_names).flatten.uniq.select {|name| name.start_with?('Kindle Store')}
  initial_categories = WarehouseCategory.com.where(name: all_similar_item_category_names).to_a
  base_categories = (initial_categories + initial_categories.collect(&:canonical_category).compact)
  all_category_ids = (base_categories.collect(&:id) + base_categories.collect(&:parent_id).compact).uniq
  canonical_category_ids = all_category_ids.collect {|category_id| canonical_category_id_by_alternative_parent_id[category_id]}.flatten.compact.uniq
  all_categories = (WarehouseCategory.canonical.where(parent_id: all_category_ids) + WarehouseCategory.find(canonical_category_ids)).flatten.uniq

  # category id => [[warehouse_book_version_id, warehouse_category_id, rank], ...]
  # chunk depends on order(:warehouse_category_id) to group contiguous rows.
  list_stat_values_by_category_id = WarehouseListStat.where(warehouse_category_id: all_categories.collect(&:id), warehouse_date_id: warehouse_date.id).order(:warehouse_category_id).value_of(:warehouse_book_version_id, :warehouse_category_id, :rank).chunk {|_, warehouse_category_id, _| warehouse_category_id}.each_with_object({}) do |values, hash|
    hash[values[0]] = values[1]
  end
  warehouse_book_version_ids = list_stat_values_by_category_id.collect {|_, values| values.collect(&:first)}.flatten.uniq
  # book version id => that book's overall Amazon sales rank for the date
  amazon_sales_rank_by_book_version_id = WarehouseStat.where(warehouse_book_version_id: warehouse_book_version_ids, warehouse_date_id: warehouse_date.id).value_of(:warehouse_book_version_id, :amazon_sales_rank).each_with_object({}.with_indifferent_access) do |warehouse_stat_values, hash|
    hash[warehouse_stat_values[0]] = warehouse_stat_values[1]
  end

  stats.each_with_object({}.with_indifferent_access) do |stat, hash|
    # Narrow the category universe down to the ones this stat's book belongs to.
    my_category_names = stat.amazon_similar_item_category_names
    my_filtered_categories = base_categories.select {|category| my_category_names.include? category.name}
    my_category_ids = my_filtered_categories.collect(&:id) + my_filtered_categories.collect(&:parent_id).compact
    my_canonical_category_ids = my_category_ids.collect {|category_id| canonical_category_id_by_alternative_parent_id[category_id]}.flatten.compact.uniq
    my_categories = all_categories.select {|category| my_category_ids.include?(category.parent_id) || my_canonical_category_ids.include?(category.id)}

    # position starts at 101, i.e. "not within the top 100" until beaten below
    details = my_categories.each_with_object({}.with_indifferent_access) do |related_category, details_hash|
      details_hash[related_category.id] = {currently_ranked: false, position: 101, next_sales_rank: 0, prev_sales_rank: 0, name: related_category.name}
    end
    my_categories.collect(&:id).each do |my_category_id|
      list_stat_values_by_category_id[my_category_id].each do |warehouse_book_version_id, warehouse_category_id, rank|
        # Skip list rows we cannot compare against: anonymous entries or missing sales ranks.
        next unless warehouse_book_version_id.present? && amazon_sales_rank_by_book_version_id[warehouse_book_version_id].present? && stat.amazon_sales_rank.present?

        details[warehouse_category_id][:currently_ranked] = true if stat.warehouse_book_version_id == warehouse_book_version_id
        if stat.amazon_sales_rank <= amazon_sales_rank_by_book_version_id[warehouse_book_version_id] && details[warehouse_category_id][:position] > rank
          # We outrank this entry: record the best (lowest) list position beatable so far.
          details[warehouse_category_id][:position] = rank
          details[warehouse_category_id][:prev_sales_rank] = amazon_sales_rank_by_book_version_id[warehouse_book_version_id]
        elsif stat.amazon_sales_rank > amazon_sales_rank_by_book_version_id[warehouse_book_version_id] && details[warehouse_category_id][:next_sales_rank] < amazon_sales_rank_by_book_version_id[warehouse_book_version_id]
          # Entry is ahead of us: track the closest sales rank we'd need to beat.
          details[warehouse_category_id][:next_sales_rank] = amazon_sales_rank_by_book_version_id[warehouse_book_version_id]
        end

        # Track sales-rank extremes seen anywhere on this list.
        if details[warehouse_category_id][:best_sales_rank].blank? || amazon_sales_rank_by_book_version_id[warehouse_book_version_id] < details[warehouse_category_id][:best_sales_rank]
          details[warehouse_category_id][:best_sales_rank] = amazon_sales_rank_by_book_version_id[warehouse_book_version_id]
        end

        if details[warehouse_category_id][:worst_sales_rank].blank? || amazon_sales_rank_by_book_version_id[warehouse_book_version_id] > details[warehouse_category_id][:worst_sales_rank]
          details[warehouse_category_id][:worst_sales_rank] = amazon_sales_rank_by_book_version_id[warehouse_book_version_id]
        end
      end if list_stat_values_by_category_id[my_category_id].present?
    end if my_categories.present?

    hash[stat.warehouse_book_version_id] = details
  end
end
-
end
-
1
# A named trend that warehouse list stats can be tagged with.
class WarehouseTrend < ActiveRecord::Base
  attr_accessible :name

  # Associations
  has_many :warehouse_list_stats, inverse_of: :warehouse_trend

  # Validations
  validates :name, presence: true
end
-
1
# Thin wrapper around the Amazon Product Advertising API (via the amazon-ecs
# gem).  Batched key lookups plus per-field extractors that HTML-entity-decode
# the raw response values.
module AmazonApi
  extend self

  # Fetches Amazon items for a batch of +keys+ (:asin or :isbn13 key_type) on
  # the given +tld+.  Returns {key => {lookup: [items], search: [items],
  # status: sym}} where status is one of :invalid_key_type, :invalid_key,
  # :ready_for_amazon_ingestion, :ambiguous_results, :no_results, :throttled
  # or :external_error.  A key matching exactly one item is safe to ingest.
  def get_all_items_by_keys_and_tld(keys, key_type, tld)
    keys = Array.wrap(keys).uniq
    items_by_key = {}

    # Only ASIN and ISBN13 lookups are supported.
    if key_type.to_sym != :asin && key_type.to_sym != :isbn13
      return keys.each_with_object({}) {|key, hash| hash[key] = {lookup: [], search: [], status: :invalid_key_type}}
    end

    # Pre-validate key shape (ASIN = 10 chars, ISBN13 = 13 chars).
    items_by_key = keys.each_with_object({}) do |key, hash|
      hash[key] = {lookup: [], search: [], status: :ready_for_amazon_ingestion}
      hash[key][:status] = :invalid_key if key.blank? || (key_type.to_sym == :asin && key.length != 10) || (key_type.to_sym == :isbn13 && key.length != 13)
    end

    lookup_key_hashes = items_by_key.select {|key, hash| hash[:status] == :ready_for_amazon_ingestion}

    return items_by_key unless lookup_key_hashes.present?

    # First pass: one batched ItemLookup for all still-valid keys.
    Amazon::Ecs.options = AmazeBot.config[:amazon][:amazon_ecs][:options]
    item_lookup_res = Amazon::Ecs.item_lookup(lookup_key_hashes.keys.join(','), AmazeBot.config[:amazon][:amazon_ecs][:item_lookup_options].merge({country: convert_tld_to_country(tld)}))
    match_response_items_by_key!(lookup_key_hashes, item_lookup_res, key_type, :lookup)
    # FIX: was a non-destructive `merge` with the return value discarded (a
    # no-op).  The sub-hashes are the same objects stored in items_by_key, so
    # statuses propagated regardless; merge! makes the intent explicit.
    items_by_key.merge! lookup_key_hashes

    # Second pass: ItemSearch for keys the lookup didn't resolve to exactly one item.
    search_key_hashes = lookup_key_hashes.select {|key, hash| hash[:lookup].length != 1}
    if search_key_hashes.present?
      item_search_res = Amazon::Ecs.item_search(search_key_hashes.keys.join(' | '), AmazeBot.config[:amazon][:amazon_ecs][:item_search_options].merge({country: convert_tld_to_country(tld)}))
      match_response_items_by_key!(search_key_hashes, item_search_res, key_type, :search)

      search_key_hashes.each do |key, hash|
        if hash[:search].length > 1 || (hash[:search].length == 0 && hash[:lookup].length > 1)
          Rails.logger.tagged('book_data') {Rails.logger.info "item_search returned more than 1 result or item_lookup returned more than one result (with no results from item_search) so we wont ingest: #{key}"}
          hash[:status] = :ambiguous_results
        elsif hash[:search].length == 1
          Rails.logger.tagged('book_data') {Rails.logger.info "item_lookup returned no results (or more than 1 result) but item_search returned only 1 result so we will ingest: #{key}"}
        else
          Rails.logger.tagged('book_data') {Rails.logger.info "item_lookup returned no results and item_search returned no results so this book doesn't exist: #{key}"}
          hash[:status] = :no_results
        end
      end

      # FIX: same discarded-merge no-op as above.
      items_by_key.merge! search_key_hashes
    end

    items_by_key
  rescue *HTTP_ERRORS => e
    # Amazon signals throttling with an HTTP 503.
    if e.message.include? "503"
      Rails.logger.tagged('book_data') {Rails.logger.info "The amazon api response request for ISBN #{keys.join(", ")} was throttled #{e.message}"}
      items_by_key.each {|key, hash| hash[:status] = :throttled}
      return items_by_key
    end
    Rails.logger.tagged('book_data') {Rails.logger.info "The amazon api response request for ISBN #{keys.join(", ")} returned an unknown error: #{e.message}"}
    items_by_key.each {|key, hash| hash[:status] = :external_error}
    return items_by_key
  end

  # --- Field extractors ------------------------------------------------------
  # Each reads one element from an Amazon::Ecs response item; values are run
  # through an HTMLEntities decoder (see get_decoder) unless noted.

  def get_asin_from_item(item)
    item.get("ASIN")
  end

  def get_author_from_item(item)
    get_decoder.decode item.get("ItemAttributes/Author")
  end

  def get_binding_from_item(item)
    get_decoder.decode item.get("ItemAttributes/Binding")
  end

  def get_brand_from_item(item)
    get_decoder.decode item.get("ItemAttributes/Brand")
  end

  def get_creator_from_item(item)
    get_decoder.decode item.get("ItemAttributes/Creator")
  end

  def get_ean_from_item(item)
    get_decoder.decode item.get("ItemAttributes/EAN")
  end

  # Single EAN from the EANList (first element only).
  def get_ean_list_element_from_item(item)
    get_decoder.decode item.get("ItemAttributes/EANList/EANListElement")
  end

  # All EANs from the EANList.
  def get_ean_list_elements_from_item(item)
    item.get_array("ItemAttributes/EANList/EANListElement").map {|val| get_decoder.decode val}
  end

  def get_eisbn_from_item(item)
    get_decoder.decode item.get("ItemAttributes/EISBN")
  end

  def get_isbn_from_item(item)
    get_decoder.decode item.get("ItemAttributes/ISBN")
  end

  # Item dimensions: numeric value coerced to Integer, unit read from the
  # element's Units attribute.  nil when the element is absent.
  def get_item_dimensions_height_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/ItemDimensions/Height")).to_i if item.get("ItemAttributes/ItemDimensions/Height").present?
  end

  def get_item_dimensions_height_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/ItemDimensions/Height").attributes["Units"].value if item.get_element("ItemAttributes/ItemDimensions/Height").present?
  end

  def get_item_dimensions_length_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/ItemDimensions/Length")).to_i if item.get("ItemAttributes/ItemDimensions/Length").present?
  end

  def get_item_dimensions_length_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/ItemDimensions/Length").attributes["Units"].value if item.get_element("ItemAttributes/ItemDimensions/Length").present?
  end

  def get_item_dimensions_weight_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/ItemDimensions/Weight")).to_i if item.get("ItemAttributes/ItemDimensions/Weight").present?
  end

  def get_item_dimensions_weight_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/ItemDimensions/Weight").attributes["Units"].value if item.get_element("ItemAttributes/ItemDimensions/Weight").present?
  end

  def get_item_dimensions_width_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/ItemDimensions/Width")).to_i if item.get("ItemAttributes/ItemDimensions/Width").present?
  end

  def get_item_dimensions_width_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/ItemDimensions/Width").attributes["Units"].value if item.get_element("ItemAttributes/ItemDimensions/Width").present?
  end

  # Truncated to 255 chars to fit the column it is persisted to.
  def get_label_from_item(item)
    if item.get("ItemAttributes/Label").present?
      label = item.get("ItemAttributes/Label")
      get_decoder.decode label.first(255)
    end
  end

  # Image URLs are URI-decoded rather than entity-decoded.
  def get_large_image_url_from_item(item)
    URI.decode item.get("LargeImage/URL") if item.get("LargeImage/URL").present?
  end

  def get_medium_image_url_from_item(item)
    URI.decode item.get("MediumImage/URL") if item.get("MediumImage/URL").present?
  end

  def get_small_image_url_from_item(item)
    URI.decode item.get("SmallImage/URL") if item.get("SmallImage/URL").present?
  end

  # List price amount in minor units (cents) as an Integer.
  def get_list_price_amount_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/ListPrice/Amount")).to_i if item.get("ItemAttributes/ListPrice/Amount").present?
  end

  def get_list_price_currency_code_from_item(item)
    get_decoder.decode item.get("ItemAttributes/ListPrice/CurrencyCode")
  end

  def get_manufacturer_from_item(item)
    if item.get("ItemAttributes/Manufacturer").present?
      manufacturer = item.get("ItemAttributes/Manufacturer")
      get_decoder.decode manufacturer.first(255)
    end
  end

  def get_number_of_pages_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/NumberOfPages")).to_i if item.get("ItemAttributes/NumberOfPages").present?
  end

  # Package dimensions: same pattern as item dimensions above.
  def get_package_dimensions_height_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/PackageDimensions/Height")).to_i if item.get("ItemAttributes/PackageDimensions/Height").present?
  end

  def get_package_dimensions_height_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/PackageDimensions/Height").attributes["Units"].value if item.get_element("ItemAttributes/PackageDimensions/Height").present?
  end

  def get_package_dimensions_length_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/PackageDimensions/Length")).to_i if item.get("ItemAttributes/PackageDimensions/Length").present?
  end

  def get_package_dimensions_length_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/PackageDimensions/Length").attributes["Units"].value if item.get_element("ItemAttributes/PackageDimensions/Length").present?
  end

  def get_package_dimensions_weight_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/PackageDimensions/Weight")).to_i if item.get("ItemAttributes/PackageDimensions/Weight").present?
  end

  def get_package_dimensions_weight_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/PackageDimensions/Weight").attributes["Units"].value if item.get_element("ItemAttributes/PackageDimensions/Weight").present?
  end

  def get_package_dimensions_width_from_item(item)
    get_decoder.decode(item.get("ItemAttributes/PackageDimensions/Width")).to_i if item.get("ItemAttributes/PackageDimensions/Width").present?
  end

  def get_package_dimensions_width_unit_from_item(item)
    get_decoder.decode item.get_element("ItemAttributes/PackageDimensions/Width").attributes["Units"].value if item.get_element("ItemAttributes/PackageDimensions/Width").present?
  end

  # Amazon publication dates come in several formats: "YYYY-MM-DD", "YYYY-MM",
  # bare "YYYY", or "Mon YYYY".  Returns a Date, or nil when absent/unparsable.
  def get_publication_date_from_item(item)
    date = item.get("ItemAttributes/PublicationDate")
    if date.present?
      if date.include? "-"
        if date.count("-") > 1
          return Date.strptime(date, "%Y-%m-%d")
        else
          return Date.strptime(date, "%Y-%m")
        end
      elsif date.strip.scan(/\D/).blank?
        return Date.strptime(date, "%Y")
      else
        return Date.strptime(date, "%b %Y")
      end
    end
  rescue ArgumentError
    Rails.logger.tagged('book_data') {Rails.logger.info "Amazon API returned malformed date string, can't parse date"}
    nil
  end

  def get_publisher_from_item(item)
    if item.get("ItemAttributes/Publisher").present?
      publisher = item.get("ItemAttributes/Publisher")
      get_decoder.decode publisher.first(255)
    end
  end

  def get_sales_rank_from_item(item)
    get_decoder.decode item.get("SalesRank")
  end

  def get_studio_from_item(item)
    if item.get("ItemAttributes/Studio").present?
      studio = item.get("ItemAttributes/Studio")
      get_decoder.decode studio.first(255)
    end
  end

  def get_title_from_item(item)
    get_decoder.decode item.get("ItemAttributes/Title")
  end

  # Every ISBN-ish identifier the item exposes (EISBN, EAN, full EAN list).
  def get_all_isbns(item)
    Array.wrap(AmazonApi.get_eisbn_from_item(item)) + Array.wrap(AmazonApi.get_ean_from_item(item)) + AmazonApi.get_ean_list_elements_from_item(item)
  end

  private

  # Memoized HTML entity decoder shared by all extractors.
  def get_decoder
    @decoder ||= HTMLEntities.new
  end

  # Distributes response items into each key's :lookup or :search bucket by
  # matching on ASIN or on any of the item's ISBNs.  Mutates +key_hashes+.
  def match_response_items_by_key!(key_hashes, response, key_type, api_query_type)
    key_hashes.keys.each do |key|
      response.items.each do |item|
        if key_type.to_sym == :asin
          key_hashes[key][api_query_type] << item if AmazonApi.get_asin_from_item(item) == key
        else
          key_hashes[key][api_query_type] << item if AmazonApi.get_all_isbns(item).include? key
        end
      end
    end
  end

  # Maps a marketplace TLD to the amazon-ecs country code.
  def convert_tld_to_country(tld)
    case tld
    when '.com'
      'us'
    when '.co.uk'
      'uk'
    else
      # FIX: was `throw`, which is Ruby's catch/throw control flow, not error
      # signalling; an uncaught throw surfaced as ArgumentError anyway, so
      # raising ArgumentError preserves what callers could have rescued.
      raise ArgumentError, 'Bad TLD/Country conversion'
    end
  end
end
-
1
# Ad-hoc analysis/report helpers run from the console; each *_report method
# writes a CSV into the current working directory.
module DataAnalysis
  # NOTE(review): leftover scratch assignment — this local in the module body
  # is never read by any method below; looks like debugging residue.
  asin = 'B008LQ1A68'

  # CSV report of how often each of +asins+ appeared in other books'
  # "also bought" slots over the last month.
  def self.all_also_bought_appearance_counts_report(asins)
    dates = (Date.current - 1.month)..Date.current
    data = dates.each_with_object({}) do |date, hash|
      counts = DataAnalysis.all_also_bought_appearance_counts(date)
      hash[date] = asins.collect {|asin| counts[asin]}
    end

    csv = CSV.open('./also_bought_appearances.csv', 'wb')
    csv << ['Date', 'Total Books', 'Total Books With Appearances', 'Percent Books with Appearances', 'Mean Appearance Count', 'Median Appearance Count', 'Mode Appearance Count']
    dates.each do |date|
      csv << [date, asins.count, data[date].compact.count, "#{'%.2f' % (data[date].compact.count.to_f / asins.count * 100.0)}%",
              data[date].compact.mean, data[date].compact.median, data[date].compact.mode]
    end
    csv.close
  end

  # asin => total number of "also bought" slots (1..6) it occupied across all
  # .com warehouse stats on +date+.
  def self.all_also_bought_appearance_counts(date)
    warehouse_date_id = WarehouseDate.find_by(date: date).id
    # One GROUP BY count per also-bought slot column.
    counts = (0..5).collect do |num|
      WarehouseStat.where(warehouse_date_id: warehouse_date_id, warehouse_region_id: WarehouseRegion.com.id).group(:"amazon_also_bought_asin_#{num + 1}").count
    end
    # Fold the six per-slot tallies into a single asin => count hash.
    total_counts = counts[0]
    (1..5).each do |num|
      counts[num].each_pair {|asin_key, count| total_counts[asin_key] = total_counts.fetch(asin_key, 0) + count}
    end

    total_counts
  end

  # Number of .com stats on +date+ listing +asin+ in any also-bought slot.
  # (Squeel block syntax: | is SQL OR across the six columns.)
  def self.also_bought_appearances(asin, date)
    warehouse_date_id = WarehouseDate.find_by(date: date).id

    WarehouseStat.where{(amazon_also_bought_asin_1 == asin) |
                        (amazon_also_bought_asin_2 == asin) |
                        (amazon_also_bought_asin_3 == asin) |
                        (amazon_also_bought_asin_4 == asin) |
                        (amazon_also_bought_asin_5 == asin) |
                        (amazon_also_bought_asin_6 == asin)}.where(warehouse_date_id: warehouse_date_id, warehouse_region_id: WarehouseRegion.com.id).count
  end

  # dates = (Date.current - 1.week)..Date.current
  # CSV report of how many top-100 lists each ASIN *newly* entered per day
  # (present on a list on day N but absent on day N-1); the range is extended
  # one day back so the first requested day has a baseline.
  def self.dates_entered_top_100_lists_report(asins, start_date, end_date)
    warehouse_dates_by_id = WarehouseDate.where(date: (start_date - 1.day)..end_date).each_with_object({}) do |warehouse_date, hash|
      hash[warehouse_date.id] = warehouse_date
    end
    warehouse_dates = warehouse_dates_by_id.values
    # asin => {warehouse_category_id => [dates the asin appeared on that list]}
    data_by_asins = {}
    warehouse_dates.each do |warehouse_date|
      WarehouseListStat.where(warehouse_date_id: warehouse_date.id).where(asin: asins).each do |list_stat|
        data_by_asins[list_stat.asin] ||= {}
        data_by_asins[list_stat.asin][list_stat.warehouse_category_id] ||= []
        data_by_asins[list_stat.asin][list_stat.warehouse_category_id] << warehouse_dates_by_id[list_stat.warehouse_date_id].date
      end
    end

    warehouse_categories_by_id = WarehouseCategory.where(id: data_by_asins.values.collect(&:keys).flatten.uniq).each_with_object({}) do |category, hash|
      hash[category.id] = category
    end

    dates_entered = {}
    report = warehouse_dates.each_with_object({}) {|warehouse_date, hash| hash[warehouse_date.date] = {}}
    data_by_asins.each_pair do |asin, date_lists_by_warehouse_category_id|
      dates_entered[asin] ||= {}
      date_lists_by_warehouse_category_id.each_pair do |warehouse_category_id, date_list|
        dates = []
        # Walk days pairwise: warehouse_dates[1..-1][i] vs warehouse_dates[i]
        # compares each day against the previous one.
        warehouse_dates[1..-1].each_with_index do |warehouse_date, index|
          if date_list.present? && date_list.include?(warehouse_date.date) && date_list.exclude?(warehouse_dates[index].date)
            dates << warehouse_date.date
            report[warehouse_date.date][asin] ||= 0
            report[warehouse_date.date][asin] += 1
          end
        end

        dates_entered[asin][warehouse_categories_by_id[warehouse_category_id]] = dates
      end
    end

    csv = CSV.open('./date_entered_top_100s.csv', 'wb')
    csv << ['Date', 'Total Books', 'Total Books Newly Entering Top 100s', 'Percent Books That Newly Entered Lists',
            'Mean # of Lists Newly Entered', 'Median # of Lists Newly Entered', 'Mode # of Lists Newly Entered']
    report.each_pair do |date, hash|
      csv << [date, asins.count, report[date].values.sum, "#{'%.2f' % (report[date].values.sum.to_f / asins.count * 100.0)}%",
              report[date].values.mean, report[date].values.median, report[date].values.mode]
    end
    csv.close
  end

  # date => delta in amazon_review_count versus the previous day's stat for
  # the .com book version with +asin+.  Missing counts are treated as 0.
  def self.number_of_new_amazon_reviews(asin, start_date, end_date)
    book_version = WarehouseBookVersion.find_by(asin: asin, tld: '.com')
    # Range starts one day early to provide a baseline for start_date.
    warehouse_date_ids = WarehouseDate.where(date: (start_date - 1.day)..end_date).value_of(:id)
    stats = book_version.warehouse_stats.includes(:warehouse_date).where(warehouse_date_id: warehouse_date_ids).joins(:warehouse_date).order('warehouse_dates.date ASC')

    if stats.present?
      # stats[1..-1] with each_with_index pairs every stat with its
      # predecessor (index i in the slice maps to stats[i], the day before).
      stats[1..-1].each_with_index.each_with_object({}) do |stat_and_index, hash|
        current_count = stat_and_index[0].amazon_review_count || 0
        day_before_count = stats[stat_and_index[1]].amazon_review_count || 0
        hash[stat_and_index[0].warehouse_date.date] = (current_count) - (day_before_count)
      end
    else
      {}
    end
  end

  # CSV report summarizing, per day, how many of +asins+ gained reviews and
  # the mean/median/mode of the positive review deltas.
  def self.number_of_new_amazon_reviews_report(asins, start_date, end_date)
    dates = start_date..end_date
    data = asins.each_with_object({}) do |asin, hash|
      review_counts = DataAnalysis.number_of_new_amazon_reviews(asin, dates.first, dates.last)
      review_counts.each_pair do |date, count|
        hash[date] ||= []
        hash[date] << count
      end
    end

    csv = CSV.open('./new_amazon_reviews.csv', 'wb')
    csv << ['Date', 'Total Books', 'Total Books With New Reviews', 'Average # of Books with New Reviews', 'Mean New Review Count', 'Median New Review Count', 'Mode New Review Count']
    dates.each do |date|
      csv << [date, asins.count, data[date].compact.count {|x| x > 0}, "#{'%.2f' % (data[date].compact.count {|x| x > 0}.to_f / asins.count * 100.0)}%",
              data[date].compact.select {|x| x > 0}.mean, data[date].compact.select {|x| x > 0}.median, data[date].compact.select {|x| x > 0}.mode]
    end
    csv.close
  end
end
-
1
# One-off cleanup helpers for repairing book version identifier data.
module DataCleanup
  extend self

  # Backfills a missing ISBN13 on +book_version+ from its stored Amazon API
  # response.  If the resolved ISBN13 already belongs to another book version
  # on the same TLD, the record is flagged as a duplicate instead of updated.
  def populate_missing_isbn13_from_api(book_version)
    if book_version.isbn13.blank? && book_version.update_amazon_api_response == :ready_for_amazon_ingestion
      item = book_version.amazon_api_response.matching_response_item
      # matching_response_item returns a Symbol on failure, an item on success.
      if item.class != Symbol && item.isbn13.present?
        book_version.isbn13 = item.isbn13
        if WarehouseBookVersion.where(isbn13: book_version.isbn13, tld: book_version.tld).exists?
          Rails.logger.tagged('cleanup') {Rails.logger.info "BookVersion #{book_version.id} ASIN: #{book_version.asin} matches existing isbn13: #{book_version.isbn13}"}
          book_version.duplicate_key = book_version.isbn13
          book_version.isbn13 = nil
          book_version.status = :duplicate_isbn13
        else
          Rails.logger.tagged('cleanup') {Rails.logger.info "Updated BookVersion #{book_version.id} ASIN: #{book_version.asin} to isbn13: #{book_version.isbn13}"}
        end
        book_version.save
      else
        Rails.logger.tagged('cleanup') {Rails.logger.info "BookVersion #{book_version.id} ASIN: #{book_version.asin} found no isbn13. Self published?"}
      end
    end
  end

  # TODO: Needs to be fixed, author_name = book_version.title ahahahahahahahahaha
  #def find_ean_from_asin(book_version)
  #  # If book version has no isbn13 or bn_id, proceed to matching to convert ASIN -> EAN (which is either an ISBN13 or BN ID)
  #  ean = nil
  #  book_format = book_version.book_format
  #  title = book_version.title
  #  author_name = book_version.title
  #
  #  # If book_version has type, title and author_name proceed
  #  if book_format.present? && title.present? && author_name.present?
  #    amazon_page = AmazonProductPage.new book_version.amazon_url
  #    ean = amazon_page.scrape_isbn_13
  #
  #    # If format isn't of these 3 types, our only option is to find the isbn13 on the page so return it immediately
  #    return ean if book_format.exclude?('Hardcover') && book_format.exclude?('Paperback') && book_format.exclude?('Kindle')
  #
  #    #TODO: consider switching order of the next 2 methods if one proves to be more reliable than the other
  #    # check if any BN search results show same pub date as book version
  #    if book_version.pub_date.present? && ean.blank?
  #      bn_search_page = BnSearchPage.by_title_and_author_and_book_format book_version.title, book_version.author_name, book_version.book_format
  #      ean = bn_search_page.ean_for book_version.pub_date
  #    end
  #
  #    # if no pub date match exists go through amazon physical method and get isbn13 for a physical version
  #    # only do this when trying to find a matching Nook for a Kindle title
  #    #TODO: clean this up to work for all book formats if it seems promising
  #    ean = DataCleanup.find_isbn13_by_amazon_physical_method amazon_page if ean.blank? && book_format.include?('Kindle')
  #  else
  #    Rails.logger.tagged('cleanup') {Rails.logger.info "Book Version #{book_version.id} is missing data (book_format, book, or book author) and matching can't proceed"}
  #  end
  #
  #  ean
  #end

  # Scrapes physical-edition ISBN13s from the given Amazon product page and
  # probes BN for each one, returning the first Nook EAN found, or nil.
  def find_isbn13_by_amazon_physical_method(page)
    isbn13s = page.scrape_physical_isbn13s

    isbn13s.each do |isbn13|
      bn_page = BnBookPage.by_ean(isbn13)
      if bn_page.ok?
        ean = bn_page.scrape_nook_ean
        return ean if ean.present?
      end
    end
    # BUG FIX: previously fell through returning the (truthy, non-empty)
    # isbn13s array from #each, which callers checking ean.blank? would
    # mistake for a found EAN.  Return nil explicitly when nothing matched.
    nil
  end
end
-
1
require 'csv'
-
1
require 'net/ftp'
-
1
require 'zip'
-
-
1
module EnterpriseReports
-
1
INVALID_CHARS_MAPPING = {"¨" => ' ', "ʺ" => '""', "˝" => '""', "ˮ" => '""', "Ҍ" => ' ', "ҍ" => ' ', "״" => '""', "“" => '""', "”" => '""', "‟" => '""', "″" => '""', "⠐" => ' ', "〃" => ' ', "々" => ' ', "ゝ" => ' ', "ゞ" => ' ', "ヽ" => ' ', "ヾ" => ' '}
-
1
extend self
-
-
1
# Runs +sql+ through sql_copy_to_csv, optionally gzips, uploads to S3, and —
# when the produced row count is within 5% of +expected_count+ (or no
# expectation was given) — FTPs and/or emails the report.  Otherwise an alert
# email is sent and delivery is withheld.
# NOTE(review): report_class is not defined in this file — presumably supplied
# by the including report class; confirm.
def sql_copy_to_csv_and_deliver_report(sql, client_name, base_filename, expected_count, ftp: false, emailable_report_name: nil, gzip: false)
  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{client_name}) Report Started -----"}

  csv = sql_copy_to_csv("#{client_name}-#{emailable_report_name}",
                        get_report_location(client_name, get_filename(base_filename)),
                        WarehouseStat.connection.raw_connection,
                        sql)


  csv = gzip_report(csv, client_name) if gzip
  move_to_s3(client_name, csv)

  # Determine if report sending is valid based on counts then ftp/set redis/email
  # (sql_copy_to_csv published the actual row count into this redis hash).
  row_count = $redis.hgetall('daily_report_stats')["#{client_name}-#{emailable_report_name}-row-count"].to_f
  if expected_count.nil? || EnterpriseReports.report_count_valid?(row_count, expected_count)
    ftp_to_client(client_name, csv) if ftp
    EnterpriseReportsMailer.basic_report(generate_report_hash(base_filename, client_name, gzip), get_report_email_details(client_name, emailable_report_name)).deliver if emailable_report_name.present?
    Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{client_name}) Report Delivered -----"}
  else
    EnterpriseReports.send_report_count_error base_filename, row_count, expected_count
    Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{client_name}) Report NOT Delivered: row count off by 5% of more -----"}
  end
end
-
-
1
# True when the actual row count is within ±5% of the expected count.
# NOTE(review): an expected_count of 0 yields NaN, which compares false —
# i.e. zero-expectation reports are always considered invalid; confirm intended.
def report_count_valid?(row_count, expected_count)
  relative_delta = (expected_count - row_count) / expected_count.to_f
  relative_delta.abs < 0.05
end
-
-
1
# Emails an alert that a report was withheld because its row count deviated
# too far from the expected count.
def send_report_count_error(report_identifier, row_count, expected_count)
  message = "#{report_identifier} - Report blocked from sending and FTPing because todays count: #{row_count} was too far off the expected count: #{expected_count}"
  NotificationMailer.report_row_count_error(report_identifier, message).deliver
end
-
-
1
# Appends the .csv extension to a report's base filename.
def get_filename(base_filename)
  format('%s.csv', base_filename)
end
-
-
1
# Full path for a client's report file under the environment-specific
# reports root directory.
def get_report_location(client_name, filename)
  reports_root = AmazeBot.config[:reports][:location][Utilities.env]
  File.join(reports_root, client_name.to_s, filename)
end
-
-
1
# Per-client, per-report email settings from app config, wrapped with
# indifferent access so callers can use string or symbol keys.
def get_report_email_details(client_name, report_name)
  AmazeBot.config[:reports][:clients][client_name][:reports][report_name].with_indifferent_access
end
-
-
1
# Assembles the metadata hash that mailers/uploaders use to locate a report
# (adds a .gz suffix to the filename when the report was gzipped).
def generate_report_hash(base_filename, client_name, gzip = false)
  filename = get_filename(base_filename)
  filename = "#{filename}.gz" if gzip
  {
    base_filename: base_filename,
    filename: filename,
    report_location: get_report_location(client_name, filename),
    report_format: 'csv',
    client_name: client_name
  }
end
-
-
1
# Uploads +file+ to S3 via CarrierWave under the client's folder (plus any
# +extra_folders+) and returns the resulting public S3 URL.
def move_to_s3(client_name, file, extra_folders = [])
  uploader = ReportUploader.new
  uploader.client_name = client_name
  uploader.add_folders_to_store_dir(extra_folders) if extra_folders.present?
  # Ensure buffered writes hit disk before CarrierWave reads the file.
  file.flush
  uploader.store! file

  File.join 'https://s3.amazonaws.com', AmazeBot.config[:carrier_wave][:fog_directory][Utilities.env], uploader.store_dir, File.basename(file.path)
end
-
-
1
# Streams the result of a Postgres COPY ... TO STDOUT (CSV with header, all
# fields force-quoted) into the file at +report_location+, sanitizing each row
# for the downstream consumer.  Progress is published to the
# 'daily_report_stats' redis hash so long exports can be monitored.
# Returns the open (flushed, not closed) File handle.
def sql_copy_to_csv(report_key, report_location, raw_connection, sql)
  dirname = File.dirname report_location
  begin
    # FIX: File.exists? is a deprecated alias (removed in Ruby 3.2); use File.exist?.
    Dir.mkdir dirname unless File.exist? dirname
    csv = File.open report_location, 'wb'
  rescue Errno::EEXIST
    # open the file as normal if the dir already exists. This means another process on the same dyno
    # created it already for this user
    csv = File.open report_location, 'wb'
  end

  row_count = 0
  $redis.hmset('daily_report_stats', "#{report_key}-row-count", row_count, "#{report_key}-send-time", "Not Sent Yet")

  raw_connection.exec("COPY (#{sql}) TO STDOUT WITH CSV HEADER FORCE QUOTE *;")
  while !(data = raw_connection.get_copy_data).nil?
    row_count += 1

    # coerce data to ASCII for RH and remove newlines (and duplicate newlines) except for last one (this regex specifically ignores last /n)
    # also remove Windows carriage returns (\r) and escaped newlines (\\n) in case RH has any issues processing those as well
    # also manually handle non-unicode quote characters by coercing them to escaped quotes and anything else that
    # maps to a quote character to a space.
    csv << data.to_ascii(INVALID_CHARS_MAPPING).gsub(/\\n+|\r+|\n+(?!$)/, '')

    # Heartbeat every 10k rows.
    if row_count % 10000 == 0
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{report_key}) Copy to CSV wrote: #{row_count} lines -----"}
      $redis.hmset('daily_report_stats', "#{report_key}-row-count", row_count, "#{report_key}-send-time", "Not Sent Yet")
    end
  end

  csv.flush
  $redis.hmset('daily_report_stats', "#{report_key}-row-count", row_count, "#{report_key}-send-time", Time.current.to_s)
  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{report_key}) Copy to CSV completed: #{row_count} lines-----"}

  csv
end
-
-
1
# Opens the report's CSV file for writing, creating its parent directory if
# needed.  Returns the open CSV handle (caller is responsible for closing).
def open_csv(report_hash)
  dirname = File.dirname report_hash[:report_location]
  begin
    # FIX: File.exists? is a deprecated alias (removed in Ruby 3.2); use File.exist?.
    Dir.mkdir dirname unless File.exist? dirname
    CSV.open(report_hash[:report_location], 'wb')
  rescue Errno::EEXIST
    # open the file as normal if the dir already exists. This means another process on the same dyno
    # created it already for this user
    CSV.open(report_hash[:report_location], 'wb')
  end
end
-
-
1
# Extracts every entry of the zip archive +file+ into +destination+,
# creating intermediate directories as needed; existing files are skipped.
def unzip_file(file, destination)
  Zip::File.open(file) do |archive|
    archive.each do |entry|
      target_path = File.join(destination, entry.name)
      FileUtils.mkdir_p(File.dirname(target_path))
      archive.extract(entry, target_path) unless File.exist?(target_path)
    end
  end
end
-
-
1
# FTPs the finished CSV (then an ALLFILES.DONE completion marker) to the
# client's FTP server.  Production only; currently only :rhinc has a
# configured destination.
# SECURITY(review): FTP credentials are hard-coded and sent in the clear
# (plain FTP); move them to config/ENV and prefer FTPS/SFTP.
def ftp_to_client(client_name, csv)
  if Rails.env.production?
    if client_name.to_sym == :rhinc
      # FTP CSV File
      Net::FTP.open('ftp.randomhouse.com', 'booklr', 'B00Klr') do |ftp|
        ftp.passive = true
        ftp.chdir 'to_rh'
        ftp.putbinaryfile csv.path
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report Delivered to RH FTP Server (#{csv.path})-----"}
      end

      # FTP Completion file
      file = File.open('/tmp/ALLFILES.DONE', 'w')
      Net::FTP.open('ftp.randomhouse.com', 'booklr', 'B00Klr') do |ftp|
        ftp.passive = true
        ftp.chdir 'to_rh'
        ftp.putbinaryfile file.path
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report Delivered to RH FTP Server (#{file.path})-----"}
      end
      # BUG FIX: close the handle before deleting the file; the original
      # deleted first and then closed a handle to an already-unlinked file.
      file.close
      File.delete(file.path)
    end
  end
end
-
-
1
# FTPs the file at +report_location+ to the Random House 'to_rh' drop folder.
# Production only.
# SECURITY(review): credentials are hard-coded and sent over plain FTP — same
# issue as ftp_to_client; move to config/ENV.
def ftp_to_rhpg(report_location)
  if Rails.env.production?
    Net::FTP.open('ftp.randomhouse.com', 'booklr', 'B00Klr') do |ftp|
      ftp.passive = true
      ftp.chdir 'to_rh'
      ftp.putbinaryfile report_location
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report Delivered to RH FTP Server (#{report_location})-----"}
    end
  end
end
-
-
1
# Collects the values of all stat_hash keys containing +type_string+, in key
# order. Keys whose (downcased) name mentions "price" are converted from
# integer cents to a float dollar amount, or nil when the value is blank.
def output_customer_behavior_fields(stat_hash, type_string)
  matching_keys = stat_hash.keys.select { |key| key.include? type_string }
  matching_keys.collect do |key|
    value = stat_hash[key]
    if key.downcase.include?("price")
      value.present? ? (value.to_i / 100.0) : nil
    else
      value
    end
  end
end
-
-
1
def output_field_and_change_and_percentage_change(stat, last_week_stat, field, float=false)
-
13
value = float == true ? stat[field].to_f : stat[field].to_i if stat.has_key?(field) && stat[field].present?
-
13
last_value = float == true ? last_week_stat[field].to_f : last_week_stat[field].to_i if last_week_stat.has_key?(field) && last_week_stat[field].present?
-
-
13
if value.present? && last_value.present?
-
8
change = value - last_value
-
8
percent_change = percent_change_from(last_value, value)
-
8
return [value, change, percent_change]
-
end
-
5
[value, nil, nil]
-
end
-
-
1
# Serializes +obj+ via the given block and left-pads the result with nils so
# it is exactly +size+ elements long; a blank obj yields all nils.
def prepend_serialized_data(obj, size)
  serialized = obj.present? ? yield(obj) : Array.new(size)
  shortfall = size - serialized.size
  return serialized unless shortfall > 0
  Array.new(shortfall) + serialized
end
-
-
1
# Serializes +obj+ via the given block and right-pads the result with nils so
# it is exactly +size+ elements long; a blank obj yields all nils.
def pad_serialized_data(obj, size)
  serialized = obj.present? ? yield(obj) : Array.new(size)
  missing = size - serialized.size
  missing > 0 ? serialized + Array.new(missing) : serialized
end
-
-
1
# Wraps an ISBN as ="<isbn>" so spreadsheet apps treat it as text and do not
# strip leading zeros or render it in scientific notation.
def isbn_output(isbn)
  format('="%s"', isbn)
end
-
-
1
# Percent change for rank-style metrics, where a drop in value is an
# improvement: (old - new) * 100 / new, rounded to 2 places. Returns nil
# when either value is missing or new_val is zero.
def percent_change_from_for_rank(old_val, new_val)
  return nil unless old_val.present? && new_val.present? && new_val != 0

  scaled_delta = (old_val - new_val) * 100.0
  (scaled_delta / new_val).round(2)
end
-
-
1
# Standard percent change: (new - old) * 100 / old, rounded to 2 places.
# Returns nil when either value is blank or old_val is zero, and 0.0 when
# the values are equal.
def percent_change_from(old_val, new_val)
  return nil if old_val.blank? || new_val.blank?
  return 0.0 if old_val == new_val

  unless old_val == 0
    growth = (new_val - old_val) * 100.0
    (growth / old_val).round(2)
  end
end
-
-
1
# Builds one [title, author, key] row for the RHPG exception report.
# Prefers the ingested warehouse metadata; falls back to the tracked book
# version's stored metadata (which may be a raw quoted String in legacy rows
# or an Array); finally falls back to 'No data' placeholders. The third
# column is always the book's isbn13-or-asin key.
def get_rhpg_exception_report_row(tracked_book_version)
  if tracked_book_version.warehouse_book_version.title.present? && tracked_book_version.warehouse_book_version.author_name.present?
    [tracked_book_version.warehouse_book_version.title, tracked_book_version.warehouse_book_version.author_name,
     tracked_book_version.warehouse_book_version.isbn_or_asin]
  elsif tracked_book_version.metadata.present?
    if tracked_book_version.metadata.class == String
      # Legacy rows store metadata as one quoted string: strip the outer
      # quotes, then split on the "," delimiters to recover the fields.
      metadata_array = tracked_book_version.metadata.gsub(/^"|"$/, '').split('","')
      metadata_array.count > 1 ? [metadata_array[0], metadata_array[1], tracked_book_version.warehouse_book_version.isbn_or_asin] : ['No data', 'No data', tracked_book_version.warehouse_book_version.isbn_or_asin]
    else
      metadata_array = tracked_book_version.metadata
      metadata_array.count > 1 ? [metadata_array[0], metadata_array[1], tracked_book_version.warehouse_book_version.isbn_or_asin] : ['No data', 'No data', tracked_book_version.warehouse_book_version.isbn_or_asin]
    end
  else
    ['No data', 'No data', tracked_book_version.warehouse_book_version.isbn_or_asin]
  end
end
-
-
1
# Builds one RHINC exception-report row: title, author, isbn-or-asin key,
# asin, format, division code, exception timestamp, then one 'X'/'' column
# per boolean exception flag (see report_boolean_output).
def get_rhinc_exception_report_row(tracked_book_version, book_version_exception)
  [tracked_book_version.get_book_title, tracked_book_version.get_author_name,
   tracked_book_version.warehouse_book_version.isbn_or_asin, tracked_book_version.get_book_version_asin,
   tracked_book_version.get_book_format, tracked_book_version.get_division_code,
   book_version_exception.created_at, report_boolean_output(book_version_exception.amazon_not_found_in_search),
   report_boolean_output(book_version_exception.amazon_ambiguous_result),
   report_boolean_output(book_version_exception.amazon_no_buy_button),
   report_boolean_output(book_version_exception.amazon_no_price),
   report_boolean_output(book_version_exception.bn_not_found_in_search),
   report_boolean_output(book_version_exception.apple_invalid)]
end
-
-
1
# Renders a boolean flag for CSV output: 'X' when truthy, blank otherwise.
def report_boolean_output(field)
  if field
    'X'
  else
    ''
  end
end
-
-
1
# Gzips the finished report in place via the system gzip binary and returns
# an open File handle on the resulting .gz file.
#
# NOTE(review): the backtick call neither shell-escapes csv.path nor checks
# $? for failure — a path with spaces or a gzip error goes unnoticed.
def gzip_report(csv, client_name)
  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{client_name}) starting GZIP process -----"}
  `gzip #{csv.path}`
  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_class}(#{client_name}) GZIP Finished -----"}

  # gzip replaces the original file with <path>.gz, so reopen under that name.
  File.open("#{csv.path}.gz")
end
-
-
1
# Name of the concrete reporter for log tagging. When these helpers are
# extended onto a class, self is that Class and its own name is used;
# otherwise the instance's class name is used.
def report_class
  target = self.class == Class ? self : self.class
  target.to_s
end
-
end
-
1
module Formulas
  # Compound average growth rate between two values over +period_length+
  # periods. With growth: :positive the ratio is end/beginning; any other
  # growth direction inverts the ratio. Returns nil when either value is
  # missing or the period is not positive.
  def self.average_growth_rate(beginning_value, end_value, period_length, growth = :positive)
    return unless beginning_value.present? && end_value.present? && period_length > 0

    ratio =
      if growth == :positive
        end_value.to_f / beginning_value.to_f
      else
        beginning_value.to_f / end_value.to_f
      end

    ratio ** (1.0 / period_length.to_f) - 1.0
  end
end
-
1
# Scheduling helpers that feed validated WarehouseBookVersions into the
# Amazon product-API ingestion pipeline via Sidekiq.
module IngestionQueue
  BATCH_SIZE = 10

  # Enqueues up to +number_of_api_calls+ AmazonApiWorkers::GetManyApiResponses
  # jobs, each carrying one batch of book version ids for a single tld and
  # key column (isbn13 batches are tried before asin batches per tld).
  # Returns the number of jobs actually queued.
  def self.queue_book_versions(number_of_api_calls)
    # Queue up all orphaned ingestions
    IngestionQueue.handle_orphaned_book_versions

    # In order of priority cascade the number_of_api_calls down each method till you run out of ingestions for that block
    count = 0
    # Squeel query over validated book versions that have some key; value_of
    # yields [tld, id, isbn13] triples which are folded into
    # {tld => {isbn13: [ids], asin: [ids]}}.
    grouped_ids = WarehouseBookVersion.where(status: %w[validated validated_from_top_100s], tld: Utilities::TLDS).where{(warehouse_book_versions.asin != nil) | (warehouse_book_versions.isbn13 != nil)}.order(:tld, :isbn13).limit(number_of_api_calls * AmazonApiWorkers::GetManyApiResponses::BATCH_SIZE).value_of(:tld, :id, :isbn13).each_with_object({}) do |warehouse_book_version_values, hash|
      hash[warehouse_book_version_values[0]] ||= {}
      if warehouse_book_version_values[2].present?
        hash[warehouse_book_version_values[0]][:isbn13] ||= []
        hash[warehouse_book_version_values[0]][:isbn13] << warehouse_book_version_values[1]
      else
        hash[warehouse_book_version_values[0]][:asin] ||= []
        hash[warehouse_book_version_values[0]][:asin] << warehouse_book_version_values[1]
      end
    end

    Utilities::TLDS.each do |tld|
      [:isbn13, :asin].each do |key_column|
        if count < number_of_api_calls && grouped_ids[tld].present? && grouped_ids[tld][key_column].present?
          grouped_ids[tld][key_column].each_slice(AmazonApiWorkers::GetManyApiResponses::BATCH_SIZE) do |warehouse_book_version_ids|
            timestamp = Time.current
            # Mark the batch in-flight before queueing so a later pass will
            # not pick the same rows up again.
            WarehouseBookVersion.where(id: warehouse_book_version_ids).update_all(status: :api_call_in_progress, updated_at: timestamp)
            AmazonApiWorkers::GetManyApiResponses.perform_async warehouse_book_version_ids, key_column, tld
            count += 1
            break if count == number_of_api_calls
          end
        end
      end
    end

    Rails.logger.tagged('ingestions') {Rails.logger.info "Completed queuing #{count} book version ingestions"} if count > 0
    count
  end

  # Rescues book versions stuck in intermediate states for over 10 minutes:
  # re-validates 'new', re-ingests 'ready_for_amazon_ingestion', and resets
  # 'api_call_in_progress' back to 'validated' so they get re-queued.
  def self.handle_orphaned_book_versions
    # Try and validate all books stuck in new state
    WarehouseBookVersion.where{(status == 'new') & (updated_at < Time.current - 10.minutes)}.order(:id).value_of(:id).each do |warehouse_book_version_id|
      BookVersionValidationWorkers::ValidateNewBookVersion.perform_async warehouse_book_version_id
    end

    # Ingest all books ready for ingestion since API calls are already done
    WarehouseBookVersion.where{(status == 'ready_for_amazon_ingestion') & (updated_at < Time.current - 10.minutes)}.order(:id).value_of(:id).each do |warehouse_book_version_id|
      BookVersionWorkers::Ingest.perform_async warehouse_book_version_id
    end

    # Set all api_call_in_progress status books to validated if they get stuck in that state
    WarehouseBookVersion.where{(status == 'api_call_in_progress') & (updated_at < Time.current - 10.minutes)}.update_all(status: "validated", updated_at: Time.current)
  end

  # Number of validated book versions with a usable key awaiting ingestion.
  def self.ingestions_available
    WarehouseBookVersion.where(status: 'validated').where{(warehouse_book_versions.isbn13 != nil) | (warehouse_book_versions.asin != nil)}.count
  end
end
-
1
# Thin wrapper around the public iTunes lookup API for ebook metadata.
module ItunesApi
  # Looks an ISBN13 up in the US, GB and AU stores (in that order) and
  # returns {:itunes_id, :itunes_pub_date, :itunes_genres} from the first
  # store with a match; an empty hash when nothing matches anywhere.
  def self.get_metadata_by_isbn13(isbn13)
    metadata = {}
    base_url = "http://itunes.apple.com/lookup?isbn=#{isbn13}"

    %w[us gb au].each do |region|
      url = region == "us" ? base_url : base_url + "&country=#{region}"
      result = HttpHelper.get_json(url)
      if result.present? && result['resultCount'] > 0
        data = result['results'].first
        metadata[:itunes_id] = data['trackId'].to_s
        metadata[:itunes_pub_date] = ScraperUtilities.parse_date_string(data['releaseDate'])
        # Genre list is joined and truncated to fit a 255-char column.
        metadata[:itunes_genres] = data['genres'].join(', ').first(255)
        break
      end
    end

    metadata
  end

  # US-store price and rating data for an iTunes track id; empty hash when
  # the lookup returns no results.
  def self.get_itunes_data(itunes_id)
    results = HttpHelper.get_json("http://itunes.apple.com/lookup?id=#{itunes_id}")
    data = HashWithIndifferentAccess.new
    if results.present? && results['resultCount'] > 0
      result = results['results'].first
      # NOTE(review): stripping the '.' converts e.g. "9.99" to 999 cents,
      # but a price rendered as "9.9" would become 99 rather than 990 —
      # confirm the API always emits two decimal places.
      data[:itunes_price] = result['price'].to_s.gsub('.','')
      data[:itunes_average_rating] = result['averageUserRating']
      data[:itunes_rating_count] = result['userRatingCount']
    end

    data
  end

  # GB and AU price/rating data, keyed itunes_<region>_price,
  # itunes_<region>_average_rating and itunes_<region>_rating_count.
  # Regions with no result are simply absent from the returned hash.
  def self.get_regional_itunes_data(itunes_id)
    %w[gb au].each_with_object(HashWithIndifferentAccess.new) do |region, results|
      result = HttpHelper.get_json("http://itunes.apple.com/lookup?id=#{itunes_id}&country=#{region}")
      if result.present? && result['resultCount'] > 0
        data = result['results'].first
        # Same decimal-stripping caveat as get_itunes_data above.
        results["itunes_#{region}_price"] = data['price'].to_s.gsub('.','')
        results["itunes_#{region}_average_rating"] = data['averageUserRating']
        results["itunes_#{region}_rating_count"] = data['userRatingCount']
      end
    end
  end
end
-
1
# Sidekiq scheduler workers that fan bestseller-list scraping out into
# per-list / per-category jobs, all writing into that day's Mongo collection.
module ListStatWorkers
  # Splits the canonical amazon .com category set into id-range blocks and
  # queues one QueueAmazonTop100s job per block.
  class QueueAmazonTop100sBlock
    include Sidekiq::Worker
    sidekiq_options queue: :scheduling

    def perform(block_size = 2000)
      collection_name = MongoUtilities.daily_collection_name(:amazon_list_stats)
      # NOTE(review): `.amazon.com.canonical` chains named scopes — `.com`
      # presumably filters to the .com tld; verify against WarehouseCategory.
      params = WarehouseCategory.amazon.com.canonical.order(:id).value_of(:id).each_slice(block_size).collect do |id_slice|
        [id_slice.first, id_slice.last, collection_name]
      end
      Sidekiq::Client.push_bulk('class' => ListStatWorkers::QueueAmazonTop100s, 'args' => params)
    end
  end

  # Queues one MongoListStatWorkers::AmazonTop100 job per canonical category
  # within the given id range.
  class QueueAmazonTop100s
    include Sidekiq::Worker
    sidekiq_options queue: :scheduling

    def perform(starting_warehouse_category_id, ending_warehouse_category_id, collection_name)
      # NOTE(review): the literal 1 in the args looks like a starting
      # page/position — confirm against MongoListStatWorkers::AmazonTop100.
      params = WarehouseCategory.amazon.com.canonical.where(id: starting_warehouse_category_id..ending_warehouse_category_id).order(:id).value_of(:category_id, :tld, :name, :id).collect do |category_id, tld, name, id|
        [category_id, tld, ScraperUtilities.base_category_from_category_name(name), 1, id, collection_name]
      end
      Sidekiq::Client.push_bulk('class' => MongoListStatWorkers::AmazonTop100, 'args' => params)
    end
  end

  # Queues one Barnes & Noble top-100 scrape per configured list name.
  class QueueBarnesNobleTop100Scrape
    include Sidekiq::Worker
    sidekiq_options queue: :scheduling

    def perform
      collection_name = MongoUtilities.daily_collection_name(:bn_list_stats)
      params = MongoListStatWorkers::BarnesNobleTop100::LIST_NAMES.values.collect do |list_name|
        [list_name, collection_name]
      end
      Sidekiq::Client.push_bulk('class' => MongoListStatWorkers::BarnesNobleTop100, 'args' => params)
    end
  end

  # Queues a paid and a free Apple top-books feed job for every Apple category.
  class QueueAppleTopBooksScrape
    include Sidekiq::Worker
    sidekiq_options queue: :scheduling

    def perform
      collection_name = MongoUtilities.daily_collection_name(:apple_list_stats)
      values = WarehouseCategory.apple.order(:id).value_of(:category_id, :id).each_with_object([]) do |category_id_and_id, array|
        array << [category_id_and_id[0], 'paid', category_id_and_id[1], collection_name]
        array << [category_id_and_id[0], 'free', category_id_and_id[1], collection_name]
      end
      Sidekiq::Client.push_bulk('class' => MongoListStatWorkers::AppleTopBooksFeed, 'args' => values)
    end
  end
end
-
1
# SAX mapping for an ONIX <contributor> composite (short-tag b036 → author).
class Contributor
  include SAXMachine
  element :b036, :as => :author
end
-
-
1
# SAX mapping for an ONIX <title> composite.
class Title
  include SAXMachine
  element :b203, :as => :main_title
  element :b029, :as => :sub_title
end
-
-
1
# SAX mapping for an ONIX <workidentifier> composite (short-tag b244 → work id).
class WorkIdentifier
  include SAXMachine
  element :b244, :as => :work_id
end
-
-
1
# SAX mapping for an ONIX <salesrights> composite.
class SalesRights
  include SAXMachine
  element :b089, :as => :sales_right_type
  element :b090, :as => :country_codes
  element :b388, :as => :territory_codes
end
-
-
1
# SAX mapping for one ONIX <product> record; only the fields the merge/purge
# feed processors below need are declared.
class Product
  include SAXMachine
  element :a002, :as => :status             # 02/03 = add-or-change, 05 = removal (see feed processors)
  element :a001, :as => :isbn13             # record key, used as the ISBN13 throughout
  element :b394, :as => :publishing_status  # 02 = forthcoming, 04 = active (see feed processors)
  element :b243, :as => :division_code
  element :title, :as => :title, :class => Title
  element :workidentifier, :as => :workidentifier, :class => WorkIdentifier
  elements :contributor, :as => :contributors, :class => Contributor
  elements :salesrights, :as => :salesrights, :class => SalesRights
end
-
-
1
module MergePurge
-
1
# Reconciles a user's tracked list against an ONIX delta feed: parses the
# feed into add/remove ISBN sets plus per-ISBN metadata, creates any unknown
# WarehouseBookVersions, batch-inserts TrackedBookVersions (with metadata
# serialized as YAML) and batch-deletes the removals. RHDE users get their
# own feed processor; all other ONIX users share the RHINC processor.
def self.reconcile_onix_xml(xml, user)
  Rails.logger.tagged('onix') {Rails.logger.info "--- ONIX Merge purge run for ISBNs in #{File.basename(xml)} for #{user.name} ---"}

  # Setup
  isbns_to_add = []
  isbns_to_remove = []
  metadata = {}
  user_existing_book_versions = user.warehouse_book_versions.value_of(:id, :isbn13)
  user_existing_isbns = user_existing_book_versions.collect(&:last).compact #remove all nil isbn13s
  user_existing_book_version_ids = user_existing_book_versions.collect(&:first)

  # The feed processors mutate isbns_to_add / isbns_to_remove / metadata in place.
  if user.email.include? 'rhde@booklr.com'
    MergePurge.process_rhde_onix_feed(xml, isbns_to_add, isbns_to_remove, metadata)
  else
    MergePurge.process_rhinc_onix_feed(xml, isbns_to_add, isbns_to_remove, metadata)
  end

  # Additions
  existing_book_versions_to_add = WarehouseBookVersion.where(isbn13: isbns_to_add, tld: user.tld).value_of(:id, :isbn13)
  existing_book_versions_isbns_to_add = existing_book_versions_to_add.collect(&:last)

  Rails.logger.tagged('onix') {Rails.logger.info "The following ISBNs already exist on their list: #{(existing_book_versions_isbns_to_add + user_existing_isbns).join(', ')}"}

  creation_isbns = isbns_to_add - existing_book_versions_isbns_to_add
  Rails.logger.tagged('onix') {Rails.logger.info "Creating the following new ISBNs (#{creation_isbns.count}): #{creation_isbns.join(', ')}"}

  creation_ids = creation_isbns.collect{|isbn| WarehouseBookVersion.create(isbn13: isbn, source: user.email, tld: user.tld).id}
  Rails.logger.tagged('onix') {Rails.logger.info 'Completed creating new Book Versions'}

  existing_book_version_ids_to_add = existing_book_versions_to_add.collect(&:first)

  book_version_ids_to_add = creation_ids + (existing_book_version_ids_to_add - user_existing_book_version_ids)

  # Removals
  existing_book_versions_to_remove = WarehouseBookVersion.where(isbn13: isbns_to_remove, tld: user.tld).value_of(:id, :isbn13)
  existing_book_versions_isbns_to_remove = existing_book_versions_to_remove.collect(&:last)

  Rails.logger.tagged('onix') {Rails.logger.info "The following ISBNs trying to be removed do not exist in the database: #{(isbns_to_remove - existing_book_versions_isbns_to_remove).join(', ')}"}
  Rails.logger.tagged('onix') {Rails.logger.info "The following ISBNs trying to be removed do not exist on their list: #{(existing_book_versions_isbns_to_remove - user_existing_isbns).join(', ')}"}

  book_version_ids_to_remove = existing_book_versions_to_remove.collect(&:first)

  # Batch insert new tracked book versions if any
  if book_version_ids_to_add.present?
    columns = %w[user_id warehouse_book_version_id metadata created_at updated_at]
    values = []
    timestamp = Time.current

    # Get all the ids we're adding and map them to their corresponding isbn13 so we can use that to pull from the metadata hash
    mapping = WarehouseBookVersion.where(id: book_version_ids_to_add).value_of(:id, :isbn13).each_with_object({}) {|vals, hash| hash[vals.first] = vals.last}

    book_version_ids_to_add.each do |book_version_id|
      values << [user.id, book_version_id, metadata[mapping[book_version_id]].to_yaml, timestamp, timestamp]
    end

    TrackedBookVersion.batch_insert(columns, values)
  end

  Rails.logger.tagged('onix') {Rails.logger.info "Removing the following new ISBNs (#{existing_book_versions_isbns_to_remove.count}): #{existing_book_versions_isbns_to_remove.join(', ')}"}

  # Batch delete all the removals if any
  TrackedBookVersion.delete_all(warehouse_book_version_id: book_version_ids_to_remove, user_id: user.id) if book_version_ids_to_remove.present?

  Rails.logger.tagged('onix') {Rails.logger.info "--- Completed associating new list for #{user.name} ---"}
end
-
-
1
# Parses an RHINC ONIX delta feed, appending additions to +isbns_to_add+ and
# removals to +isbns_to_remove+, and filling metadata[isbn13] with
# ['', title, author, division_code] (the first slot is a legacy asin
# column). All three out-params are mutated in place.
def self.process_rhinc_onix_feed(xml, isbns_to_add, isbns_to_remove, metadata)
  # Use Nokogiri Reader / Sax-Machine to handle parsing huge files that wont fit in memory
  reader = Nokogiri::XML::Reader(xml)
  while reader.read
    if reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT and reader.name == 'product'
      product = Product.parse(reader.outer_xml)

      # status of 02 or 03 means an add or change, status of 05 means an outright removal from the system
      if product.status == '02' || product.status == '03'
        # publishing_status of 02 means forthcoming title, 04 means active title so add these
        # any other status means the title is to be removed if it exists on their list currently
        if product.publishing_status == '02' || product.publishing_status == '04'
          isbns_to_add << product.isbn13

          title = product.title.sub_title.present? ? (product.title.main_title + ' - ' + product.title.try(:sub_title)) : product.title.main_title
          author = product.contributors.first.author if product.contributors.first.present?
          metadata[product.isbn13] = ['', title, author, product.division_code]
        else
          isbns_to_remove << product.isbn13
        end
      elsif product.status == '05'
        isbns_to_remove << product.isbn13
      end
    end
  end
end
-
-
1
# Parses an RHDE ONIX delta feed, appending additions to +isbns_to_add+ and
# removals to +isbns_to_remove+, and filling metadata[isbn13] with
# ['', title, author, division_code]. RHDE feeds additionally carry work
# identifiers, which are mirrored into the Mongo work-id collection (diffed
# against the existing mapping so only changed documents are rewritten).
# All out-params are mutated in place.
def self.process_rhde_onix_feed(xml, isbns_to_add, isbns_to_remove, metadata)
  work_id_array = []
  count = 0

  # Use Nokogiri Reader / Sax-Machine to handle parsing huge files that wont fit in memory
  reader = Nokogiri::XML::Reader(xml)
  while reader.read
    if reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT and reader.name == 'product'
      product = Product.parse(reader.outer_xml)

      # status of 02 or 03 means an add or change, status of 05 means an outright removal from the system
      if product.status == '02' || product.status == '03'
        # publishing_status of 02 means forthcoming title, 04 means active title so add these
        # any other status means the title is to be removed if it exists on their list currently
        if product.publishing_status == '02' || product.publishing_status == '04'
          count += 1
          # Progress heartbeat every 500 additions. This replaces a bare
          # Kernel#p debug print that had been left in production code.
          Rails.logger.tagged('onix') {Rails.logger.info "--- Processed #{count} RHDE ONIX additions so far ---"} if count % 500 == 0

          isbns_to_add << product.isbn13

          title = product.title.sub_title.present? ? (product.title.main_title + ' - ' + product.title.try(:sub_title)) : product.title.main_title
          author = product.contributors.first.author if product.contributors.first.present?
          metadata[product.isbn13] = ['', title, author, product.division_code]

          # capture work_id into a hash for insertion into mongo
          work_id_array << {_id: product.isbn13, work_id: product.workidentifier.work_id}.with_indifferent_access if product.workidentifier.present?
        else
          isbns_to_remove << product.isbn13
        end
      elsif product.status == '05'
        isbns_to_remove << product.isbn13
      end
    end
  end

  # work_id_collection contains additions for every delta we process, our mongo collection will hold all mappings whether they are removed or not from their list
  if work_id_array.present?
    # this will work when we upgrade mongo/mongo driver?
    #MongoUtilities.work_id_collection.insert(work_id_array, {continue_on_error: true, collect_on_error: true})

    # in the mean time pull the whole collection into memory, diff and then insert the diff
    work_id_mapping = []
    MongoUtilities.work_id_collection.find.each {|x| work_id_mapping << x.with_indifferent_access}
    diff = work_id_array - work_id_mapping
    if diff.present?
      isbns = diff.collect{|x| x['_id']}

      # remove all existing documents from the collection that coincide with the diff (in case a work_id has been updated), then insert
      MongoUtilities.work_id_collection.remove({'_id' => {'$in' => isbns}})
      MongoUtilities.work_id_collection.insert diff
    end
  end
end
-
-
1
# Merge-purge from a CSV whose column 6 holds an ISBN13 and column 7 an ASIN.
# Valid ISBN13s are preferred; otherwise a 9/10-character ASIN is accepted
# (9-character ones get a leading 0 restored). The heavy lifting is delegated
# to process_incoming_data.
def self.reconcile_asin_and_isbn13_csv_no_metadata(csv, user)
  invalid = []
  incoming_isbns = []
  incoming_asins = []
  user_existing_book_versions = user.warehouse_book_versions.value_of(:id, :asin, :isbn13)
  user_existing_isbns = user_existing_book_versions.collect(&:last).compact #remove all nil isbn13s
  user_existing_asins = user_existing_book_versions.collect(&:second).compact #remove all nil asins
  user_existing_book_version_ids = user_existing_book_versions.collect(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Merge purge run for keys in #{File.basename(csv)} for #{user.name} ---"}

  # CSV Processor is split out in case we have to have different processors for different users
  CSV.foreach(csv, :quote_char => '"', :col_sep =>',', :row_sep =>:auto) do |row|
    if row[6].present? || row[7].present?
      isbn13 = row[6].strip if row[6].present?
      asin = row[7].strip if row[7].present?
      if ISBN_Tools.is_valid_isbn13?(isbn13)
        incoming_isbns << isbn13
      elsif asin.present? && (asin.length == 9 || asin.length == 10)
        # prepend 0 to asins that are 9 digits to make it a valid 10 isbn10. excel causes preceding 0s to be stripped.
        incoming_asins << (asin.length == 9 ? '0'+asin : asin)
        invalid << isbn13 if isbn13.present?
      end
    end
  end

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Found the following invalid ISBN13s #{invalid.join(', ')}"}

  # process all the data
  self.process_incoming_data(user, user_existing_isbns, user_existing_asins, incoming_isbns, incoming_asins, user_existing_book_version_ids)
end
-
-
1
# Merge-purge from an in-memory array of [isbn, asin] pairs (sourced from
# Vook Salesforce per the log line). Valid ISBN13s win; otherwise any present
# ASIN is taken as-is. Delegates to process_incoming_data.
def self.reconcile_asin_and_isbn13_array_no_metadata(data, user)
  invalid = []
  incoming_isbns = []
  incoming_asins = []
  user_existing_book_versions = user.warehouse_book_versions.value_of(:id, :asin, :isbn13)
  user_existing_isbns = user_existing_book_versions.collect(&:last).compact #remove all nil isbn13s
  user_existing_asins = user_existing_book_versions.collect(&:second).compact #remove all nil asins
  user_existing_book_version_ids = user_existing_book_versions.collect(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Merge purge run for keys from Vook Salesfroce for #{user.name} ---"}

  data.each do |isbn, asin|
    if isbn.present? || asin.present?
      if ISBN_Tools.is_valid_isbn13?(isbn)
        incoming_isbns << isbn
      elsif asin.present?
        incoming_asins << asin
        invalid << isbn if isbn.present?
      end
    end
  end

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Found the following invalid ISBN13s #{invalid.join(', ')}"}

  # process all the data
  self.process_incoming_data(user, user_existing_isbns, user_existing_asins, incoming_isbns, incoming_asins, user_existing_book_version_ids)
end
-
-
1
# Shared second half of the ASIN/ISBN13 merge-purge flows: diffs the incoming
# keys against the user's current list, creates unknown WarehouseBookVersions,
# batch-inserts new TrackedBookVersions and batch-deletes dropped ones.
def self.process_incoming_data(user, user_existing_isbns, user_existing_asins, incoming_isbns, incoming_asins, user_existing_book_version_ids)

  user_existing_keys = user_existing_isbns + user_existing_asins
  incoming_keys = incoming_isbns + incoming_asins

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Adding: #{(incoming_keys - user_existing_keys).count} out of a total new list size of: #{incoming_keys.count}"}
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Matched: #{(user_existing_keys & incoming_keys).count} from the incoming list with the existing list."}

  # Squeel: `asin >> incoming_asins` is an IN predicate.
  existing_book_versions = WarehouseBookVersion.where{(asin >> incoming_asins) | (isbn13 >> incoming_isbns)}.where(tld: user.tld).value_of(:id, :status, :asin, :isbn13)
  existing_book_versions_isbns = existing_book_versions.collect(&:last)
  existing_book_versions_asins = existing_book_versions.collect(&:third)
  existing_keys = existing_book_versions_isbns + existing_book_versions_asins

  creation_keys = incoming_keys - existing_keys
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Creating the following new keys: #{creation_keys.join(', ')}"}

  # 13-character keys are ISBN13s; anything else is treated as an ASIN.
  creation_ids = creation_keys.collect{|key| (key.length == 13 ? WarehouseBookVersion.create(isbn13: key, source: user.email, tld: user.tld) : WarehouseBookVersion.create(asin: key, source: user.email, tld: user.tld)).id}
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Completed creating new Book Versions #{creation_ids.join(', ')}"}

  existing_book_version_ids = existing_book_versions.collect(&:first)

  book_version_ids_to_add = creation_ids + (existing_book_version_ids - user_existing_book_version_ids)
  book_version_ids_to_remove = user_existing_book_version_ids - existing_book_version_ids

  # Batch insert new tracked book versions if any
  if book_version_ids_to_add.present?
    columns = %w[user_id warehouse_book_version_id created_at updated_at]
    values = []
    timestamp = Time.current

    book_version_ids_to_add.each do |book_version_id|
      values << [user.id, book_version_id, timestamp, timestamp]
    end

    TrackedBookVersion.batch_insert(columns, values)
  end

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Removing book version ids: #{(book_version_ids_to_remove).count} from the user list: #{book_version_ids_to_remove.join(', ')}"}

  # Batch delete all the removals if any
  TrackedBookVersion.delete_all(warehouse_book_version_id: book_version_ids_to_remove, user_id: user.id) if book_version_ids_to_remove.present?

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Completed associating new list for #{user.name} ---"}
end
-
-
1
# Full merge-purge from an RHUK/RHPG CSV: replaces the user's tracked list
# with the CSV contents (creating unknown WarehouseBookVersions), then
# backfills metadata onto tracked rows whose book versions are not yet
# ingested. Each +incoming+ row is [isbn13, *metadata_columns].
def self.reconcile_csv(csv, user)
  incoming = []
  invalid = []
  user_existing_book_versions = user.warehouse_book_versions.value_of(:id, :isbn13)
  user_existing_isbns = user_existing_book_versions.collect(&:last).compact #remove all nil isbn13s
  user_existing_book_version_ids = user_existing_book_versions.collect(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Merge purge run for ISBNs in #{File.basename(csv)} for #{user.name} ---"}

  # CSV Processor is split out in case we have to have different processors for different users
  self.process_rhuk_or_rhpg_csv(csv, incoming, invalid)

  # Sort the incoming list so it matches the sorted exists_uningested list below when zipping
  incoming = incoming.sort_by(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Found the following invalid ISBN13s #{invalid.join(', ')}"}

  incoming_isbns = incoming.collect(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Removing: #{(user_existing_isbns - incoming_isbns).count} book versions and adding: #{(incoming_isbns - user_existing_isbns).count} out of a total new list size of: #{incoming_isbns.count}"}
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Matched: #{(user_existing_isbns & incoming_isbns).count} from the incoming_isbns list with the existing list."}
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Removing the following ISBNs: #{(user_existing_isbns - incoming_isbns).join(', ')}"}

  existing_book_versions = WarehouseBookVersion.where(isbn13: incoming_isbns, tld: user.tld).value_of(:id, :status, :asin, :isbn13)
  existing_book_versions_isbns = existing_book_versions.collect(&:last)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "Associating the following existing ISBNs: #{(existing_book_versions_isbns - user_existing_isbns).join(', ')}"}

  creation_isbns = incoming_isbns - existing_book_versions_isbns
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Creating the following new ISBNs: #{creation_isbns.join(', ')}"}

  creation_ids = creation_isbns.collect{|isbn| WarehouseBookVersion.create(isbn13: isbn, source: user.email, tld: user.tld).id}
  Rails.logger.tagged('merge_purge') {Rails.logger.info "Completed creating new Book Versions #{creation_ids.join(', ')}"}

  existing_book_version_ids = existing_book_versions.collect(&:first)

  book_version_ids_to_add = creation_ids + (existing_book_version_ids - user_existing_book_version_ids)
  book_version_ids_to_remove = user_existing_book_version_ids - existing_book_version_ids

  # Batch insert new tracked book versions if any
  if book_version_ids_to_add.present?
    columns = %w[user_id warehouse_book_version_id created_at updated_at]
    values = []
    timestamp = Time.current

    book_version_ids_to_add.each do |book_version_id|
      values << [user.id, book_version_id, timestamp, timestamp]
    end

    TrackedBookVersion.batch_insert(columns, values)
  end

  # Batch delete all the removals if any
  TrackedBookVersion.delete_all(warehouse_book_version_id: book_version_ids_to_remove, user_id: user.id) if book_version_ids_to_remove.present?

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Completed associating new list for #{user.name} ---"}

  # Only populate metadata for uningested books, this includes the newly created books from above (order by isbn13 so zip matches)
  existing_uningested_book_versions = WarehouseBookVersion.where{status != 'ingested'}.where(isbn13: incoming_isbns, tld: user.tld).order(:isbn13).value_of(:id, :isbn13)
  existing_uningested_book_version_isbns = existing_uningested_book_versions.collect(&:last)
  existing_uningested_book_version_ids = existing_uningested_book_versions.collect(&:first)

  Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Setting metadata for uningested books: #{existing_uningested_book_version_isbns.count} ---"}

  # rows with no metadata will be set to []
  metadata = incoming.select{|row| existing_uningested_book_version_isbns.include?(row.first)}.map {|x| x.drop 1}

  existing_uningested_book_version_ids.zip(metadata).each do |book_version_id, metadata|
    TrackedBookVersion.find_by(user_id: user.id, warehouse_book_version_id: book_version_id).update_attribute :metadata, metadata
  end
  Rails.logger.tagged('merge_purge') {Rails.logger.info '--- Completed setting metadata for uningested books ---'}
end
-
-
1
# Reads an RHUK/RHPG CSV (transcoded windows-1251 → UTF-8): column 0 is the
# ISBN13, column 1 an optional ASIN. Rows with a valid ISBN13 are appended
# whole to +incoming+ (with a stripped leading zero restored on 9-character
# ASINs); bad ISBN13s go to +invalid+. Both arrays are mutated in place.
def self.process_rhuk_or_rhpg_csv(csv, incoming, invalid)
  CSV.foreach(csv, :quote_char => '"', :col_sep =>',', :row_sep =>:auto, encoding: 'windows-1251:utf-8') do |row|
    if row[0].present?
      isbn13 = row[0].strip
      asin = row[1].strip if row[1].present?
      if ISBN_Tools.is_valid_isbn13?(isbn13)
        # prepend 0 to asins that are 9 digits to make it a valid 10 isbn10. excel causes preceding 0s to be stripped.
        incoming << row.map {|value| asin.present? && value == asin && asin.length == 9 ? '0'+ value : value}
      else
        invalid << isbn13
      end
    end
  end
end
-
-
1
# Re-reads a full (non-delta) ONIX file and refreshes the stored metadata on
# the user's TrackedBookVersions, creating tracked entries for titles missing
# from the user's list. Three passes: parse the XML, resolve ISBNs to
# WarehouseBookVersion ids, then write/insert the metadata.
def self.update_metadata(xml, user)
  metadata = []
  count = 0
  invalid = 0
  reader = Nokogiri::XML::Reader(xml)

  # Pass 1: stream-parse the ONIX file, keeping only active/forthcoming titles.
  while reader.read
    if reader.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT and reader.name == 'product'
      product = Product.parse(reader.outer_xml)
      if product.publishing_status == '02' || product.publishing_status == '04'
        title = product.title.sub_title.present? ? (product.title.main_title + ' - ' + product.title.try(:sub_title)) : product.title.main_title
        author = product.contributors.first.author if product.contributors.first.present?
        metadata << {isbn13: product.isbn13, author: author, title: title, division_code: product.division_code}
      else
        invalid +=1
      end

      count +=1
      Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Processed #{count} products from the ONIX full XML found #{invalid} so far ---"} if count % 1000 == 0
    end
  end

  # Pass 2: resolve each parsed row to its WarehouseBookVersion id for this tld.
  count = 0
  metadata.each do |data|
    data[:id] = WarehouseBookVersion.where(isbn13: data[:isbn13], tld: user.tld).value_of(:id).first
    count +=1
    Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Processed #{count} metadata objects and added BookVersion ids ---"} if count % 1000 == 0
  end

  # Pass 3: write the metadata, creating tracked rows that are missing.
  count = 0
  missing = 0
  metadata.each do |metadata_item|
    if metadata_item[:id].present?
      # Original metadata format has asin as the first item so just leaving that blank for now
      metadata_array = ['', metadata_item[:title], metadata_item[:author], metadata_item[:division_code]]
      tracked_book_version = TrackedBookVersion.where(warehouse_book_version_id: metadata_item[:id], user_id: user.id).first
      if tracked_book_version.present?
        tracked_book_version.update_attributes(metadata: metadata_array)
      else
        Rails.logger.tagged('merge_purge') {Rails.logger.info "--- #{metadata_item[:id]} BookVersion doesn't exist for this user, adding it ---"}
        TrackedBookVersion.create(warehouse_book_version_id: metadata_item[:id], user_id: user.id, metadata: metadata_array)
        missing += 1
      end
    end

    count +=1
    Rails.logger.tagged('merge_purge') {Rails.logger.info "--- Updated: #{count} @ #{Time.current} metadata fields, found #{missing} missing titles from the user list ---"} if count % 1000 == 0
  end
end
-
end
-
1
# Helpers for naming and accessing the MongoDB collections used by the
# scraping pipeline. All access goes through the global $mongodb connection.
module MongoUtilities
  PLP_COLLECTION_NAME = 'print_list_prices'.freeze
  ASIN_LIST_COLLECTION_NAME = 'all_asins'.freeze
  WORK_ID_COLLECTION_NAME = 'isbn13_to_work_id'.freeze
  # Collection families that get one fresh collection per day
  # (see daily_collection_name).
  DAILY_COLLECTION_TYPES = %w[stats amazon_list_stats bn_list_stats apple_list_stats book_version_exceptions
                              author_page_data de_competitive_format_data promotion_pages].freeze

  extend self

  # Builds the per-day collection name, e.g. (:stats, 2020-01-02) => "stats_20200102".
  # Raises ArgumentError for collection types outside DAILY_COLLECTION_TYPES.
  def daily_collection_name(collection_type, date = Date.current)
    raise ArgumentError, "invalid collection type: #{collection_type}" unless DAILY_COLLECTION_TYPES.include? collection_type.to_s

    # interpolation calls to_s implicitly, so the redundant .to_s was dropped
    "#{collection_type}_#{date.to_s.gsub('-', '')}"
  end

  # The per-day collection object for the given type/date.
  def daily_collection(collection_type, date = Date.current)
    $mongodb.collection daily_collection_name(collection_type, date)
  end

  def work_id_collection
    $mongodb.collection(MongoUtilities::WORK_ID_COLLECTION_NAME)
  end

  # Runs Mongo's repairDatabase admin command.
  def repair_database
    $mongodb.command repairDatabase: 1
  end

  def scheduler_collection
    $mongodb.collection('scheduler_state')
  end

  # Upserts exception flags onto a book version's document, but only when at
  # least one flag is truthy (avoids writing all-false documents).
  def add_exception_to_collection(collection, warehouse_book_version_id, exceptions)
    collection.update({_id: warehouse_book_version_id}, {'$set' => exceptions}, upsert: true) if exceptions.values.any?
  end

  def all_asin_document(asin, tld)
    {asin: asin, tld: tld}
  end

  def exists_in_all_asin_list?(asin, tld)
    $mongodb.collection(MongoUtilities::ASIN_LIST_COLLECTION_NAME).find(all_asin_document(asin, tld)).limit(1).first.present?
  end

  # Bulk-inserts documents into the asin list; duplicate-key errors (code
  # 11000) are expected and swallowed, anything else is re-raised.
  def add_documents_to_all_asin_list(all_asin_documents)
    $mongodb.collection(MongoUtilities::ASIN_LIST_COLLECTION_NAME).insert(all_asin_documents, continue_on_error: true)
  rescue Mongo::OperationFailure => e
    raise e unless e.error_code == 11000 # Ignore duplicate key errors
  end

  # Ensures a counts document exists for the given date (idempotent upsert).
  def initialize_daily_scrape_field_counts(date_string)
    MongoUtilities.daily_scrape_field_counts_collection.update({date: date_string}, {date: date_string}, {upsert: true})
  end

  def daily_scrape_field_counts_collection
    $mongodb.collection 'daily_scrape_field_counts'
  end
end
-
1
# Maps logical database names to connection settings. The URI for each
# database is read once, at load time, from config/postgres_databases.yml.
module PostgresConfiguration
  @@hostname = YAML.load(File.read('./config/postgres_databases.yml')).with_indifferent_access

  # Builds an ActiveRecord-style configuration hash for the named database.
  def self.configuration_hash_for(db_name)
    parsed = URI.parse(@@hostname[db_name.upcase])

    {host: parsed.host,
     database: parsed.path[1..-1],
     username: parsed.user,
     password: parsed.password,
     port: parsed.port,
     adapter: 'postgresql',
     encoding: 'utf8',
     min_messages: 'WARNING'}
  end

  # Points the given ActiveRecord model class at a different database.
  def self.change_database_connection(activerecord_model, db_name)
    activerecord_model.establish_connection configuration_hash_for(db_name)
  end

  # Opens a raw PG connection for the named database.
  def self.get_raw_connection_for_db(db_name)
    parsed = URI.parse(@@hostname[db_name.upcase])

    get_raw_connection parsed.host, parsed.port, parsed.path[1..-1], parsed.user, parsed.password
  end

  # Thin wrapper around PG::Connection.connect (positional form).
  def self.get_raw_connection(host, port, dbname, user, password)
    PG::Connection.connect host, port, nil, nil, dbname, user, password
  end
end
-
1
# Low-level helpers for Postgres sequence repair and COPY streaming.
module PostgresUtilities
  # Resets the serial sequence behind table_name.column_name to the current
  # max value and returns the next id the sequence will hand out.
  def self.fix_column_sequence(connection, table_name, column_name)
    max_value = connection.execute("select max(#{column_name}) from #{table_name}").first['max']
    # setval(..., 1, false) makes an empty table's sequence hand out 1 first
    new_val = connection.execute("select setval(pg_get_serial_sequence('#{table_name}', '#{column_name}'), #{max_value.nil? ? '1, false' : max_value})").first['setval']

    max_value.nil? ? new_val.to_i : new_val.to_i + 1
  end

  # Terminates a COPY FROM STDIN (optionally aborting with errmsg), drains
  # all results and raises the server's error message unless the command
  # completed OK.
  def self.finalize_copy_command(connection, errmsg)
    errmsg ? connection.put_copy_end(errmsg) : connection.put_copy_end

    command_ok = false
    error_message = nil
    while res = connection.get_result
      command_ok = true if res.result_status == PG::PGRES_COMMAND_OK
      error_message = res.error_message
    end

    raise error_message unless command_ok
  end

  # Opens a second raw connection with the same credentials as the model's
  # current connection (useful for running a concurrent COPY stream).
  def self.get_new_connection(model)
    PostgresConfiguration.get_raw_connection model.connection.raw_connection.host,
                                             model.connection.raw_connection.port,
                                             model.connection.raw_connection.db,
                                             model.connection.raw_connection.user,
                                             model.connection.raw_connection.pass
  end

  # Streams the result of `sql` to `filename` as CSV via COPY TO STDOUT.
  # Uses the supplied raw connection or falls back to ActiveRecord's.
  def self.copy_to_csv(sql, filename, raw_connection = nil)
    dirname = File.dirname filename
    begin
      # File.exist? — File.exists? is deprecated
      Dir.mkdir dirname unless File.exist? dirname
      csv = File.open filename, 'wb'
    rescue Errno::EEXIST
      # open the file as normal if the dir already exists. This means another process on the same dyno
      # created it already for this user
      csv = File.open filename, 'wb'
    end

    conn = raw_connection || ActiveRecord::Base.connection.raw_connection
    conn.exec("COPY (#{sql}) TO STDOUT CSV")
    # BUG FIX: this loop previously read from `raw_connection`, which is nil
    # whenever the caller relied on the default connection — always use conn.
    while !(data = conn.get_copy_data).nil?
      csv << data
    end

    csv.flush
  ensure
    # Close the file even if the COPY stream fails mid-way.
    csv.close if csv && !csv.closed?
  end
end
-
# IMPORTANT NOTE: the variables set in TCPSocket are global and affect all threads on a sidekiq instance.
# This allows us to share a single port for all threads on an instance which lets us perfectly distribute load across all tor clients
module ProxyUtilities
  SITE_KEYS = [:amazon, :barnes_and_noble].freeze
  # Per-process throttle cache, defaulting every site to "not throttled".
  $throttled_sites = Hash.new(false)

  # Routes all TCP traffic through the tor proxy, picking a port from the
  # redis pool the first time it is called on this process.
  def self.force_proxy
    TCPSocket::socks_server = 'tor.vook.com'
    # get port list from redis list unless its already set for the instance
    TCPSocket::socks_port = RedisUtilities.get_available_tor_port unless TCPSocket::socks_port.present?
  end

  def self.using_proxy?
    TCPSocket::socks_server.present?
  end

  def self.port
    TCPSocket::socks_port
  end

  # check if dyno is throttled, if it is, and we haven't already set the proxy, set the proxy and scrape, otherwise scrape from instance IP
  def self.proxy_setup(site_key)
    # BUG FIX: was `raise InvalidArgument` — an undefined constant that would
    # have surfaced as NameError at runtime; ArgumentError is the intended class.
    raise ArgumentError, "unknown site key: #{site_key}" unless SITE_KEYS.include? site_key
    sync_throttled_state site_key

    if $throttled_sites[site_key]
      # Same logic as force_proxy — reuse it instead of duplicating it here.
      force_proxy
    else
      TCPSocket::socks_server = nil
      TCPSocket::socks_port = nil
    end
  end

  private

  # Keeps the in-process throttle cache in sync with the redis-side flag,
  # logging whenever the two disagree.
  def self.sync_throttled_state(site_key)
    if RedisUtilities.is_dyno_throttled? != $throttled_sites[site_key]
      p "#{site_key} mismatch between redis and instance throttled state, toggling state on dyno: #{Utilities.dyno_id}"
      $throttled_sites[site_key] = !$throttled_sites[site_key]
    end
  end
end
-
1
# Shared helpers around the global $redis connection: the tor port pool,
# dyno throttle flags, sidekiq worker-set maintenance, scrape counters and
# a small number of whitelisted named sets.
module RedisUtilities
  extend self

  # Whitelist of counter keys; maps each symbol to its redis key string.
  COUNT_KEYS = %i[amazon_statable_scrape_job_count bn_statable_scrape_job_count itunes_statable_scrape_job_count
                  goodreads_statable_scrape_job_count amazon_author_page_scrape_job_count].each_with_object({}.with_indifferent_access) {|key, hash| hash[key] = key.to_s}.freeze
  # Sets whose members are stored as JSON and must be (de)serialized on access.
  JSON_SET_KEYS = {scraped_categories: 'top-100-categories'}.with_indifferent_access.freeze
  SET_KEYS = JSON_SET_KEYS
  AVAILABLE_PORT_LIST = 'portlist'.freeze
  BLOCK_SIZE = 10000

  # populate a redis list with a list of all the ports available (multiplied by the thread count) for connections
  # eg. 10 clients with 10 ports each = 100 ports total and 100 workers with 5 threads a piece = 100 ports listed 5 times each
  # make sure that you actually have enough tor clients up and running to support the worker count
  def populate_port_list(worker_count)
    # Clear every dyno throttle flag and the old port list before repopulating.
    $redis.keys('*-throttled').each {|x| $redis.del(x)}
    $redis.del AVAILABLE_PORT_LIST

    # Ports are sequential starting at tor's default SOCKS port.
    start_port = 9050
    worker_count.times do |client_num|
      $redis.lpush(AVAILABLE_PORT_LIST, start_port+client_num)
    end
  end

  # Rotates the port list (tail popped and pushed back on) so successive
  # callers cycle through all available tor ports.
  def get_available_tor_port
    $redis.rpoplpush(AVAILABLE_PORT_LIST, AVAILABLE_PORT_LIST)
  end

  # Marks this dyno as throttled for the next hour (key expires via SETEX).
  def set_dyno_throttled
    dyno_id = Utilities.dyno_id
    $redis.setex("#{dyno_id}-throttled", 3600, 1)
  end

  # Whether this dyno's throttle key currently exists.
  def is_dyno_throttled?
    dyno_id = Utilities.dyno_id
    $redis.exists("#{dyno_id}-throttled")
  end

  # Removes sidekiq workers that are phantoms (no backing worker message) or
  # have been running a job on `queue` longer than max_age seconds.
  # Returns the number of workers examined.
  def clear_sleeping_and_phantom_workers(queue, max_age)
    workers = get_approximate_worker_list

    # Fetch every worker's message in one pipelined round-trip; results come
    # back in the same order as `workers`.
    worker_msgs = $redis.pipelined do
      workers.each do |worker_id|
        $redis.get("sidekiq:worker:#{worker_id}")
      end
    end

    # attach to each other
    worker_ids_and_msgs = workers.zip(worker_msgs)

    # delete all blank workers if any
    $redis.del worker_ids_and_msgs.select{|worker_id, msg| worker_id if msg.blank?}.collect{|worker_id, msg| "sidekiq:worker:#{worker_id}"} if worker_ids_and_msgs.select{|worker_id, msg| worker_id if msg.blank?}.present?

    # clear phantoms or sleepers
    worker_ids_and_msgs.each do |worker_id, msg|
      $redis.srem('sidekiq:workers', worker_id) if msg.blank?
      if msg.present? && queue.include?(Sidekiq.load_json(msg)['queue'])
        run_at = Sidekiq.load_json(msg)['run_at']
        if Time.current.to_i - run_at > max_age
          $redis.srem('sidekiq:workers', worker_id)
        end
      end
    end
    worker_ids_and_msgs.count
  end

  # Redis.scan the workers set to get an approximate list of all workers
  def get_approximate_worker_list
    cursor = 0
    all_workers = []
    # SSCAN returns the next cursor as a string; '0' signals the scan is done.
    while cursor != '0'
      cursor, workers = $redis.sscan('sidekiq:workers', cursor.to_i)
      all_workers += workers
    end
    all_workers
  end

  # Careful when using this, its VERY slow if the set is large (larger than 10k)
  def worker_count(queue)
    workers = $redis.smembers('sidekiq:workers')
    worker_msgs = $redis.pipelined do
      workers.each do |worker_id|
        $redis.get("sidekiq:worker:#{worker_id}")
      end
    end

    # Count messages whose queue name contains `queue`.
    worker_msgs.compact.sum do |msg|
      Sidekiq.load_json(msg)['queue'].include?(queue) ? 1 : 0
    end
  end

  # Symbol counter key for a scrape scope, e.g. :amazon_statable =>
  # :amazon_statable_scrape_job_count.
  def get_scrape_count_key(scope)
    :"#{scope}_scrape_job_count"
  end

  # Stores a counter; key must be one of COUNT_KEYS (else ArgumentError).
  def set_count(key, number)
    assert_valid_key COUNT_KEYS, key

    $redis.set COUNT_KEYS[key], number
  end

  # Reads a counter as an integer (0 when unset).
  def get_count(key)
    assert_valid_key COUNT_KEYS, key

    $redis.get(COUNT_KEYS[key]).to_i
  end

  # Adds values to a named set, JSON-encoding each value when the set is
  # JSON-backed (see JSON_SET_KEYS).
  def add_to_set(key, values)
    assert_valid_key SET_KEYS, key

    transformed_values = JSON_SET_KEYS.keys.include?(key.to_s) ? values.collect(&:to_json) : values
    $redis.sadd SET_KEYS[key], transformed_values
  end

  # Returns a set's members, JSON-decoding them for JSON-backed sets.
  def get_set_members(key)
    assert_valid_key SET_KEYS, key

    members = $redis.smembers SET_KEYS[key]
    JSON_SET_KEYS.keys.include?(key.to_s) ? members.collect {|member| JSON.parse(member)} : members
  end

  # Deletes the whole set.
  def clear_set(key)
    assert_valid_key SET_KEYS, key

    $redis.del SET_KEYS[key]
  end

  private

  # Raises ArgumentError unless `key` appears in the whitelist hash.
  def assert_valid_key(key_hash, key)
    raise ArgumentError.new("Invalid redis key, valid keys are #{key_hash.keys}") unless key_hash.stringify_keys.keys.include? key.to_s
  end
end
-
1
# Streams one day's WarehouseStat rows (for one region) out of Postgres via
# COPY TO STDOUT and fans every CSV row out to the supplied report cards.
module ReportGenerator
  extend self
  extend EnterpriseReports

  # Generates all reports for date/tld. Each report card receives every row
  # (via output_row!) and decides itself whether to include it.
  def run(date, tld, report_cards: [])
    warehouse_region_id = WarehouseRegion.find_by(tld: tld).id
    warehouse_date_id = WarehouseDate.find_by(date: date).id

    # One wide row per stat: every WarehouseStat column plus selected columns
    # from the joined book-version/date/category tables (join_select appears
    # to be a project-local relation extension — see its definition).
    relation = WarehouseStat.select(WarehouseStat.column_names.collect {|column_name| WarehouseStat.arel_table[column_name]}).
        joins(:warehouse_book_version, :warehouse_date).
        outer_joins(:warehouse_amazon_category1, :warehouse_amazon_category2, :warehouse_amazon_category3,
                    :sub_category1_author_rank_id, :sub_category2_author_rank_id, :sub_category3_author_rank_id, :sub_category4_author_rank_id).
        join_select('inner', false,
                    warehouse_book_version: %w[asin isbn13 bn_id status book_format pub_date title publisher sold_by pages physical_details author_name author_asin],
                    warehouse_date: %w[date]).
        join_select('outer', false,
                    warehouse_amazon_category1: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    warehouse_amazon_category2: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    warehouse_amazon_category3: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    sub_category1_author_rank_id: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    sub_category2_author_rank_id: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    sub_category3_author_rank_id: %w[category_id name category_type depth status tx_book_category_id tld parent_id],
                    sub_category4_author_rank_id: %w[category_id name category_type depth status tx_book_category_id tld parent_id]).
        where(warehouse_region_id: warehouse_region_id, warehouse_date_id: warehouse_date_id).
        order(:warehouse_book_version_id, :created_at)
    # Column labels in SELECT order, e.g. "warehouse_book_versions_asin";
    # aliased attributes respond to #left, plain arel attributes do not.
    row_keys = relation.select_values.collect do |attribute|
      attr = (attribute.respond_to?(:left) ? attribute.left : attribute)

      "#{attr.relation.name}_#{attr.name}"
    end
    # Indexes of *_price columns — reformatted from cents in preprocess_data.
    price_column_indexes = row_keys.each_with_index.collect {|row_key, index| index if row_key.include?('_price')}.compact

    # Prime each report card: header row, zeroed counters, open CSV handle.
    report_cards.each do |report_card|
      report_card.report_date = date
      report_card.generate_row_key_indexes row_keys
      report_card.open_csv
      report_card.set_row_count 0
      report_card.set_time_sent nil
      report_card.insert_header!
    end

    total_row_count = 0
    report_row_counts = report_cards.each_with_object({}) {|report_card, hash| hash[report_card.report_key] = 0}
    update_row_counts report_row_counts, report_generator_row_count: total_row_count, report_generator_completion_time: 'Not Sent Yet'
    Rails.logger.tagged('enterprise') {Rails.logger.info "----- ReportGenerator starting COPY -----"}

    # Stream rows out via COPY rather than instantiating AR objects.
    raw_connection = WarehouseStat.connection.raw_connection
    raw_connection.exec("COPY (#{relation.to_sql}) TO STDOUT CSV")
    while !(data = raw_connection.get_copy_data).nil?
      total_row_count += 1
      row = preprocess_data price_column_indexes, data
      report_cards.each {|report_card| report_row_counts[report_card.report_key] += 1 if report_card.output_row!(row_keys, row)}
      # Progress heartbeat into redis every 10k rows.
      update_row_counts report_row_counts, report_generator_row_count: total_row_count if total_row_count % 10000 == 0
    end

    Rails.logger.tagged('enterprise') {Rails.logger.info "----- ReportGenerator Finished: #{total_row_count} rows -----"}
    update_row_counts report_row_counts, report_generator_row_count: total_row_count, report_generator_completion_time: Time.current.to_s

    # Finalize and upload everything before any report is delivered.
    report_cards.each do |report_card|
      report_card.finalize_output!
      report_card.move_to_s3
    end

    report_cards.each do |report_card|
      report_card.deliver_report
    end

    ReportCards::ReportCard.send_rhinc_ftp_completion if report_cards.any? {|report_card| report_card.send_all_complete?}
  end

  private

  # Writes per-report row counts (plus any extra named stats) into the
  # 'daily_report_stats' redis hash as flat key/value pairs.
  def update_row_counts(report_row_counts, **extra_keys)
    keys_and_values = report_row_counts.collect {|key, count| ["#{key}_row_count", count]}.flatten + extra_keys.collect {|key_and_count| key_and_count}.flatten
    $redis.hmset 'daily_report_stats', *keys_and_values
  end

  # Parses one COPY CSV line and reformats every price column via as_price.
  def preprocess_data(price_column_indexes, data)
    row = parse_data data

    price_column_indexes.each do |index|
      row[index] = ReportUtilities.as_price(row[index])
    end

    row
  end

  # Splits a CSV line into fields, honoring quoted fields with embedded
  # commas / doubled quotes. Hand-rolled regex instead of CSV.parse —
  # presumably for speed on the hot COPY loop; confirm before replacing.
  def parse_data(data)
    ",#{data}".scan /(?<=,)(?:"(?:""|[^"])*")(?=,|\n)|(?<=,)(?:[^,]*?)(?=,|\n)/
  end
end
-
1
# Formatting and transport helpers shared by the report pipeline.
module ReportUtilities
  extend self

  # Formats an integer-cents value as a 2-decimal string ("123" => "1.23").
  # Blank values are passed through untouched.
  def as_price(value)
    return value unless value.present?

    '%.2f' % (value.to_i / 100.0)
  end

  # Downloads a (public) S3 link into tmp and ships it to the rhinc FTP client.
  def ftp_from_s3(s3_public_link)
    # Note file has to be public before this can run correctly
    file = File.new("#{Rails.root}/tmp/#{File.basename(s3_public_link)}", 'wb')
    # NOTE(review): Kernel#open on a URL depends on open-uri being loaded;
    # prefer URI.parse(...).open to avoid Kernel#open's pipe/command footgun
    # if the link could ever be attacker-controlled.
    file << open(s3_public_link).read
    file.flush

    EnterpriseReports.ftp_to_client(:rhinc, file)
  ensure
    # BUG FIX: the handle was previously leaked; close it once ftp_to_client
    # has returned (assumes the transfer uses the handle synchronously).
    file.close if file && !file.closed?
  end
end
-
1
# String/URL parsing helpers shared by the Amazon and B&N scrapers.
module ScraperUtilities
  # XPath predicate matching text nodes that contain a currency marker.
  CURRENCY_CONDITION = "(contains(text(), '$') or contains(text(), '£') or contains(text(), 'EUR'))".freeze

  # Pulls the category id segment out of /digital-text/<id> or /books/<id>
  # URLs; nil when neither pattern matches.
  def self.get_category_id_from_url(url)
    /\/digital-text\/([^\/]+)/.match(url).try(:[], 1) || /\/books\/([^\/]+)/.match(url).try(:[], 1)
  end

  # Extracts the ASIN following a /dp/, /product-reviews/, /e/ or
  # /gp/product/ segment of an Amazon URL; nil when no segment matches.
  def self.extract_asin_from_url(url)
    url.scan(/.*\/(?:dp|product-reviews|e|gp\/product)\/(?:(?:(.*?)\/)|(.*$))/).flatten.compact.first
  end

  # First URL whose embedded ASIN is in `asins` (nil when none or urls blank).
  def self.match_url_with_asins(urls, asins)
    urls.find {|url| asins.include? extract_asin_from_url(url)} if urls.present?
  end

  # Normalizes a scraped price string to a digits-only cents string.
  # NOTE(review): the `<<` branches mutate the caller's string in place —
  # confirm no caller relies on the original text afterwards.
  def self.cleanse_price(text)
    # Some prices are displayed as ranges, we can set those to nil safely
    return nil if text.include? '-'

    # EUR prices use a comma as the decimal separator.
    splitchar = text.include?('EUR') ? ',' : '.'

    if text.include?('FREE')
      text = '000'
    elsif text.split(splitchar).last.present? && text.split(splitchar).last.length < 2
      # pad a lone trailing decimal digit, e.g. "1.5" -> "1.50"
      text << '0'
    elsif text.exclude?(splitchar)
      # whole number with no decimal part, e.g. "12" -> "1200" cents
      text << '00'
    end

    # Remove all non numerical characters and remove whitespace
    text.gsub(/\D/, '')
  end

  def self.cleanse_string(text)
    # accepts only characters with valid encodings and strips out UTF8 newlines (subbed to a space) and invisible control characters (subbed to nothing)
    text.chars.select{|i| i.valid_encoding?}.join.gsub(/\p{Co}|\p{Cs}|\p{Cn}/, '').gsub(/\p{Zl}|\p{Zp}|\n/, ' ')
  end

  # Extracts the EAN from a B&N URL — either the ean= query parameter or the
  # last path segment when no parameter is present.
  def self.parse_ean_from_bn_url(url)
    # Maybe upgrade this later to take an entire a DOM element and decide between data-bn-rel or href like find_ean_by_first_result does?
    url.include?('ean=') ? url.partition('&').first.partition('ean=').third : url.partition('?').first.split('/').last

    # If the ean parameter ever moves and isn't the first param in the query string, switch to this regexp and it should be fixed
    # /\?.*ean=(.*?)&|$/.match(url)[1]
  end

  # Author-rank category names rooted at "Kindle eBooks" get the
  # "Kindle Store > " prefix prepended so they align with product categories.
  def self.convert_author_rank_category_name(category_name)
    category_name.present? && category_name.starts_with?('Kindle eBooks') ? "Kindle Store > #{category_name}" : category_name
  end

  # Lenient date parse: nil for blank or unparseable input.
  def self.parse_date_string(text)
    Date.parse(text) if text.present?
  rescue ArgumentError
    nil
  end

  # First segment of an "A > B > C" category path, whitespace-squished.
  def self.base_category_from_category_name(category_name)
    category_name.scan(/[^>]+/).first.squish
  end

  # Maps scraped Amazon format labels to canonical format names.
  def self.coerce_amazon_format(format)
    # Some stupid pages have ", .exe" and ", .doc", and other weird things in the related format book formats. Stupid.
    format = format.gsub(', .exe', '').gsub(', .doc', '').gsub(', 3.5 inch diskette', '').gsub(', .wks', '').gsub(', .xml', '')

    if format.include?('Mass Market')
      'MassMarketPaperback'
    elsif format.include?('Kindle')
      'Kindle Edition'
    else
      format.camelize
    end
  end

  # True when the B&N search page is blank or shows the no-results message.
  def self.bn_no_results?(page)
    page.blank? || (page.css('div.search-noresults-message').present? && page.css('div.search-noresults-message').text.strip.include?('Sorry, we could not find what you were looking for.'))
  end

  # Resolves a possibly-relative URL against base_url.
  # NOTE(review): URI.encode is deprecated and removed in Ruby 3.0 — revisit
  # before any Ruby upgrade.
  def self.force_absolute_url(url, base_url)
    URI.parse(URI.encode(base_url)).merge(URI.encode(url)).to_s
  end
end
-
1
# Translates scraped page objects into flat stats hashes for ingestion.
module StatsScraper
  extend self

  # All stats scrapeable from an Amazon product page.
  def get_amazon_product_page_stats(page)
    stats = {likes: page.scrape_likes,
             amazon_price: page.scrape_amazon_price,
             digital_list_price: page.scrape_digital_list_price,
             amazon_list_price: page.scrape_amazon_list_price,
             amazon_sales_rank: page.scrape_sales_rank,
             amazon_sales_rank_category: page.scrape_sales_rank_category,
             also_bought: page.scrape_also_boughts,
             bought_after_viewing: page.scrape_bought_after_viewing,
             frequently_bought_together: page.scrape_frequently_bought_together,
             similar_items_by_category: page.scrape_similar_items_by_category,
             similar_items_by_category_external_id: page.scrape_similar_items_by_category_id,
             amazon_average_rating: page.scrape_amazon_average_rating,
             amazon_review_count: page.scrape_amazon_review_count,
             author_ranks: page.scrape_author_ranks,
             amazon_availability: page.scrape_availability,
             kindle_unlimited: page.scrape_kindle_unlimited}

    stats.merge! subcategory_stats(page)

    # Hoisted into locals so each scrape_* runs once per stat (previously the
    # guard and the merge each re-invoked the scrape; assumes the scrape is
    # deterministic over the fixed DOM — confirm if scrapes are not memoized).
    related_format_data = page.scrape_related_format_data
    stats.merge! related_format_data if related_format_data.present?
    star_rating_distribution = page.scrape_star_rating_distribution
    stats.merge! star_rating_distribution if star_rating_distribution.present?

    stats
  end

  # "Also bought items by" author names, keyed by 1-based position.
  # Returns {} when the author page has none.
  def get_amazon_author_page_stats(author_page)
    also_bought_bys = author_page.scrape_also_bought_items_by
    if also_bought_bys.present?
      also_bought_bys = also_bought_bys.each_with_index.each_with_object({}) do |author_and_index, hash|
        author, index = author_and_index
        hash[:"amazon_also_bought_items_by_#{index + 1}"] = author
      end
    end

    also_bought_bys || {}
  end

  # TODO refactor this, scrape_cheapest_print_list_price needs to change and then this will too so we can handle captchas
  # Lowest print list price; only looked up for big-six-published Kindle titles.
  def get_amazon_lowest_print_list_price(page, book_format)
    big_six_publishers = ['Hachette Book Group', 'HarperCollins Publishers', 'HarperCollins Publishing', 'Macmillan',
                          'Penguin Publishing', 'Random House Digital, Inc.', 'Random House Mondadori',
                          'Simon and Schuster Digital Sales Inc']
    sold_by = page.scrape_sold_by
    if sold_by.present? && big_six_publishers.include?(sold_by) && book_format.present? && book_format.include?('Kindle')
      lowest = page.scrape_cheapest_print_list_price
      lowest[:print_list_price] if lowest.present?
    end
  end

  # Competitive-intelligence subset of the Amazon product page stats.
  def get_amazon_competitive_stats(page)
    stats = {amazon_price: page.scrape_amazon_price,
             digital_list_price: page.scrape_digital_list_price,
             amazon_list_price: page.scrape_amazon_list_price,
             amazon_sales_rank: page.scrape_sales_rank,
             publisher: page.scrape_publisher,
             pub_date: page.scrape_pub_date.to_s,
             physical_details: page.physical_details,
             language: page.scrape_language,
             isbn13: page.scrape_isbn_13,
             title: page.scrape_title,
             author: page.scrape_author
    }

    stats.merge! subcategory_stats(page)

    stats
  end

  # All stats scrapeable from a B&N product page for one EAN.
  def get_stats_for_ean(page)
    stats = {bn_price: page.scrape_price,
             bn_nook_price: page.scrape_nook_price,
             bn_nook_list_price: page.scrape_nook_list_price,
             bn_list_price: page.scrape_list_price,
             barnes_sales_rank: page.scrape_sales_rank,
             bn_also_bought: page.scrape_also_boughts,
             barnes_average_rating: page.scrape_average_rating,
             barnes_rating_count: page.scrape_rating_count,
             barnes_review_count: page.scrape_review_count}

    stats.merge!(page.scrape_related_format_data || {})
  end

  # US marketplace uses the plain iTunes lookup; others use the regional one.
  def get_stats_for_itunes(itunes_id, tld)
    tld == '.com' ? ItunesApi.get_itunes_data(itunes_id) : ItunesApi.get_regional_itunes_data(itunes_id)
  end

  # Goodreads rating details (US only; other marketplaces return {}).
  def get_stats_for_goodreads(goodreads_url, key, tld)
    if tld == '.com'
      goodreads_page = goodreads_url.present? ? GoodreadsBookPage.new(goodreads_url) : GoodreadsBookPage.by_key(key)
      return {} unless goodreads_page.ok?

      goodreads_page.rating_details || {}
    else
      {}
    end
  end

  private

  # Shared extraction of sub_categoryN_rank/_tree/_id triples; this logic was
  # previously duplicated in get_amazon_product_page_stats and
  # get_amazon_competitive_stats.
  def subcategory_stats(page)
    subs = page.scrape_sub_categories_and_ranks
    return {} unless subs.present?

    subs.each_with_index.each_with_object({}) do |(sub_category_and_rank, i), stats|
      next unless sub_category_and_rank.present?

      stats[:"sub_category#{i + 1}_rank"] = sub_category_and_rank[:rank]
      stats[:"sub_category#{i + 1}_tree"] = sub_category_and_rank[:category]
      stats[:"sub_category#{i + 1}_id"] = sub_category_and_rank[:category_id]
    end
  end
end
-
1
# Grab-bag of cross-cutting helpers: environment detection, Postgres COPY
# escaping, redis-backed feature flags, sidekiq introspection and book-key
# type detection.
module Utilities
  # Amazon marketplaces this system supports.
  TLDS = %w[.com .co.uk].freeze

  # Rails.env when Rails is loaded, else RAILS_ENV, else 'development'.
  def self.env
    (defined?(Rails).present? ? Rails.env : ENV['RAILS_ENV']) || 'development'
  end

  # Yields (column_values, data, index) for each datum so the caller can fill
  # the slots; returns the first max_size slots (nil-padded when sparse).
  def self.pad_customer_behavior_data(customer_behavior_data, max_size)
    column_values = Array.new(max_size)
    if customer_behavior_data.present?
      customer_behavior_data.each_with_index do |data, index|
        yield(column_values, data, index)
      end
    end

    column_values.first(max_size)
  end

  # Escapes a string for Postgres COPY CSV: strips invalid/invisible chars,
  # escapes backslashes and quotes, truncates to 255 chars, wraps in quotes.
  # Non-strings are returned untouched.
  def self.prepare_string_for_copy(string)
    return string unless string.is_a? String
    begin
      truncated_string = string.gsub(/\p{Co}|\p{Cs}|\p{Cn}/, '').gsub("\\", "\\\\\\").gsub(/\\xEC/i, "\\\\\\xEC").gsub(/\\xEE/i, "\\\\\\xEE").gsub("\"", "\\\"").gsub(/\p{Zl}|\p{Zp}|\n/, ' ')[0..254]
    rescue ArgumentError
      # invalid byte sequences: round-trip through UTF-16 to drop them first
      truncated_string = string.encode('UTF-16', 'UTF-8', invalid: :replace, replace: '').encode('UTF-8', 'UTF-16').gsub(/\p{Co}|\p{Cs}|\p{Cn}/, '').gsub("\\", "\\\\\\").gsub(/\\xEC/i, "\\\\\\xEC").gsub(/\\xEE/i, "\\\\\\xEE").gsub("\"", "\\\"").gsub("\n", "\\n")[0..254]
    end

    # A trailing backslash would escape the closing quote — trim it off.
    while truncated_string.end_with?("\\")
      truncated_string.chop!
    end

    "\"#{truncated_string}\""
  end

  # Parses a possibly formatted integer like "(1,234)" for COPY; blanks
  # become nil, real Integers pass through.
  def self.prepare_integer_for_copy(integer)
    return integer.presence if integer.blank? || integer.is_a?(Integer)

    integer.gsub(/\(|\)|,/, '').to_i
  end

  # Float twin of prepare_integer_for_copy.
  def self.prepare_float_for_copy(float)
    return float.presence if float.blank? || float.is_a?(Float)

    float.gsub(/\(|\)|,/, '').to_f
  end

  # Nils out scraped price strings that are ranges, dates, placeholder text
  # or would overflow a 32-bit integer column.
  def self.ignore_bad_price_for_copy(string)
    string.present? && !(string.include?('-') || string.include?('/') || string.downcase.include?('click to see') || string.to_i > 2147483647) ? string : nil
  end

  # Whitelist guard for the redis feature flags below.
  def self.check_flag_name(flag_name)
    raise 'Invalid Flag' unless %w[historic_etl ensure_one_historic_etl ingestions conditional_log].include? flag_name
  end

  def self.set_flag(flag_name)
    check_flag_name flag_name
    $redis.set "utilities:flags:#{flag_name}", 1
  end

  def self.unset_flag(flag_name)
    check_flag_name flag_name
    $redis.set "utilities:flags:#{flag_name}", 0
  end

  def self.is_flag_set?(flag_name)
    check_flag_name flag_name
    $redis.get("utilities:flags:#{flag_name}") == '1'
  end

  # Dynamic flag helpers: enable_<flag>, disable_<flag> and <flag>_enabled?.
  def self.method_missing(meth, *args, &block)
    if meth.to_s =~ /^enable_(.+)$/
      set_flag $1
    elsif meth.to_s =~ /^disable_(.+)$/
      unset_flag $1
    elsif meth.to_s =~ /^(.+)_enabled\?$/
      is_flag_set? $1
    else
      super
    end
  end

  # FIX: method_missing without respond_to_missing? leaves respond_to? and
  # Object#method blind to the dynamic flag helpers — mirror the patterns.
  def self.respond_to_missing?(meth, include_private = false)
    meth.to_s =~ /^enable_.+$|^disable_.+$|^.+_enabled\?$/ ? true : super
  end

  def self.is_author_asin?(asin)
    # This is based on the assumption that all author asins are in this format "BXXXXXXXXX" where X can be any character or number
    asin.present? && asin.length == 10 && asin[0] == 'B' && asin.scan(/[[:alnum:]]/).length == asin.length
  end

  # Dumps a scraped page (Page object or raw HTML string) to S3 for debugging.
  def self.upload_page_to_s3(page, name)
    html = page.is_a?(Page) ? page.dom : page
    file = File.new("/tmp/amazon-#{Time.now.to_s.parameterize}#{name.parameterize}.html", 'w')
    file.syswrite(html)
    uploader = HtmlPageUploader.new
    uploader.store! file
  end

  # True when any of the given job classes has a queued sidekiq job.
  def self.job_class_on_queue?(job_klasses)
    Array.wrap(job_klasses).any? do |job_klass|
      Sidekiq::Queue.new(job_klass.to_s.constantize.sidekiq_options_hash['queue']).any? {|job| job.klass == job_klass.to_s}
    end
  end

  # True when any of the given job classes sits in the retry set with more
  # than retry_count_threshold retries.
  def self.job_class_in_retry_set?(job_klasses, retry_count_threshold = -1)
    Sidekiq::RetrySet.new.any? {|job| Array.wrap(job_klasses).any? {|job_klass| job_klass.to_s == job.klass} && job['retry_count'] > retry_count_threshold}
  end

  # True when any of the given job classes is currently being worked on.
  def self.job_class_being_processed?(job_klasses)
    Sidekiq.redis do |conn|
      conn.smembers('workers').map do |w|
        msg = conn.get("worker:#{w}")
        msg ? Sidekiq.load_json(msg)['payload']['class'] : nil
      end.compact.uniq
    end.any? {|klass| Array.wrap(job_klasses).any? {|job_klass| job_klass.to_s == klass}}
  end

  # Queued, retrying or running — anywhere in sidekiq.
  def self.class_in_sidekiq?(job_klasses)
    job_class_on_queue?(job_klasses) || job_class_in_retry_set?(job_klasses) || job_class_being_processed?(job_klasses)
  end

  # Depth of an "A > B > C" category path (number of '>' separators).
  def self.get_depth_from_category_name(name)
    name.count '>'
  end

  # Classifies a product key as :bn_id, :isbn10, :isbn13 or :asin
  # (nil for nil input or unrecognized keys).
  def self.determine_key_type(key)
    if key.nil?
      nil
    elsif key.starts_with?('294') && key.length == 13
      :bn_id
    elsif ISBN_Tools.is_valid_isbn10?(key)
      :isbn10
    elsif ISBN_Tools.is_valid_isbn13?(key)
      :isbn13
    elsif key.length == 10
      :asin
    end
  end

  def self.is_isbn?(key)
    [:isbn13, :isbn10].include? determine_key_type(key)
  end

  # Collects plausible ASINs (first metadata field, 10 chars) as URL hints.
  def self.get_url_hints_from_metadata(metadatas)
    metadatas.collect do |metadata|
      asin = metadata.present? ? metadata[0].presence : nil
      asin.present? && asin.length == 10 ? asin : nil
    end.compact
  end

  # B&N's numeric format code for a book format (nil when unmapped).
  def self.bn_format_code(book_format)
    case book_format
    when 'Hardcover'
      '1519'
    when 'Paperback'
      '1521'
    when 'NOOK Book'
      '2734'
    else
      nil
    end
  end

  # Lightweight tagged stdout logging, production/staging only.
  def self.log(tag, text)
    p "[#{tag}] - #{text}" if env == 'production' || env == 'staging'
  end

  # Heroku dyno type, e.g. "worker" from "worker.3" (nil outside prod/staging).
  def self.process_type
    ENV['DYNO'].split('.').first if Utilities.env == 'production' || Utilities.env == 'staging'
  end

  # Heroku dyno number, e.g. "3" from "worker.3" (nil outside prod/staging).
  def self.dyno_id
    ENV['DYNO'].split('.').last if Utilities.env == 'production' || Utilities.env == 'staging'
  end
end
-
1
# Boolean checks used to validate scraped search/product pages and keys
# before a book version is ingested or billed as an exception.
module Validations
  # Amazon search returned zero result links.
  def self.amazon_not_found_in_search?(page)
    page.scrape_search_results_urls.blank?
  end

  # Amazon search returned several results, none of which matches a hinted ASIN.
  def self.amazon_ambiguous_search_results?(page, url_hints)
    urls = page.scrape_search_results_urls
    return false if urls.blank? || urls.count <= 1

    ScraperUtilities.match_url_with_asins(urls, url_hints).blank?
  end

  # Product page shows no Amazon price.
  def self.amazon_no_price?(page)
    page.scrape_amazon_price.blank?
  end

  # Product page has no cover image.
  def self.amazon_no_image?(page)
    !page.book_image_exists?
  end

  # Product page has no buy button.
  def self.amazon_no_buy_button?(page)
    !page.buy_button_exists?
  end

  # Key is neither a valid ISBN-10 nor a valid ISBN-13.
  def self.no_isbn?(key)
    !Utilities.is_isbn?(key)
  end

  # B&N search came back empty.
  def self.bn_not_found_in_search?(search_page)
    search_page.no_results?
  end

  # iTunes has no metadata for this key.
  def self.apple_invalid?(key)
    ItunesApi.get_metadata_by_isbn13(key) == {}
  end
end
-
1
# Minimal hand-rolled serializer: subclasses assign @valid_keys to the list
# of attribute names to expose, and as_json emits one hash per object.
class BaseSerializer
  def initialize(object)
    @object = object
    @valid_keys = []
  end

  # Serializes a single object, or each element of anything array-like
  # (responds to to_ary), into hash(es) keyed by @valid_keys.
  def as_json(options = {})
    return select_keys(@object) unless @object.respond_to?(:to_ary)

    @object.collect {|obj| select_keys obj}
  end

  private

  # Builds {key => object.key} for every whitelisted key.
  def select_keys(object)
    @valid_keys.each_with_object(HashWithIndifferentAccess.new) do |key, hash|
      hash[key] = object.send(key)
    end
  end
end
-
1
# API serializer for BookVersionCategory records.
class BookVersionCategorySerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = ['id', 'category_name', 'warehouse_book_version_id']
  end
end
-
1
# API serializer for WarehouseBookVersion records.
class BookVersionSerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = %w[id asin bn_id book_format isbn13 pub_date status title publisher sold_by pages
                     physical_details author_name author_asin itunes_id tld duplicate_key source
                     itunes_pub_date itunes_genres canonical_amazon_url canonical_bn_url
                     canonical_goodreads_url amazon_book_description]
  end
end
-
1
# API serializer for warehouse category records.
class CategorySerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = %w[id category_id depth name category_type status tld
                     parent_id canonical_category_id]
  end
end
-
1
# API serializer for per-category daily rank statistics.
class CategoryStatSerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = %w[id best_rank worst_rank book_version_count mean_rank median_rank
                     category_name best_rank_book_version_id worst_rank_book_version_id
                     warehouse_region_id warehouse_date_id warehouse_category_id date tld]
  end
end
-
1
# API serializer for bestseller-list statistics.
class ListStatSerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = %w[id warehouse_book_version_id warehouse_category_id warehouse_date_id
                     warehouse_trend_id days_in_top_100 name rank price author title
                     asin isbn bn_id itunes_id list_type]
  end
end
-
1
# API serializer for product stats: exposes every warehouse stat field plus
# a few identifying/similar-item columns.
class ProductStatSerializer < BaseSerializer
  def initialize(object)
    super(object)
    @valid_keys = WarehouseStat::WAREHOUSE_STAT_FIELDS +
                  %w[id date tld asin isbn13 amazon_similar_item_category_names
                     amazon_similar_item_category_external_ids]
  end
end
-
1
# Sidekiq workers that pull product data from the Amazon Product API.
module AmazonApiWorkers
  # Fetches API items for a batch of book versions in one bulk call and
  # kicks off ingestion for any version that becomes ready.
  class GetManyApiResponses
    BATCH_SIZE = 10
    include Sidekiq::Worker
    sidekiq_options queue: :ingest_low

    # book_version_ids - ids of WarehouseBookVersion rows to refresh
    # key_type         - attribute name used as the API lookup key (e.g. asin)
    # tld              - marketplace, e.g. ".com"
    def perform(book_version_ids, key_type, tld)
      # uncached so repeated identical queries in long batches hit the db
      ActiveRecord::Base.connection.uncached do
        # NOTE(review): this silences the AR logger process-wide, not just
        # for this job — confirm that is intentional.
        ActiveRecord::Base.logger.level = Logger::FATAL
        warehouse_book_versions = WarehouseBookVersion.where(id: book_version_ids, tld: tld)
        if warehouse_book_versions.present?
          # One bulk API call for the whole batch, keyed by asin/isbn/etc.
          items_by_key = AmazonApi.get_all_items_by_keys_and_tld warehouse_book_versions.collect(&:"#{key_type}"), key_type, tld

          warehouse_book_versions.each do |warehouse_book_version|
            warehouse_book_version.update_amazon_api_response items: items_by_key[warehouse_book_version.send key_type]
            BookVersionWorkers::Ingest.perform_async warehouse_book_version.id if warehouse_book_version.status == :ready_for_amazon_ingestion
          end
        end
      end
    end
  end
end
-
1
module BackupWorkers
-
1
# Nightly Mongo backup rotation: drops the week-old daily collections,
# then ships today's daily collections to the backup store.
class NightlyMongo
  include Sidekiq::Worker
  sidekiq_options queue: :sync

  # date_string - the day to back up, defaults to today (ISO date string).
  def perform(date_string = Date.current.to_s)
    Rails.logger.tagged('backup') {Rails.logger.info "Creating Mongo Backup at #{Time.current}"}
    date = date_string.to_date

    # Drop all collections from 7 days ago + 7 days ago author_page_data
    collection_types.each do |collection_type|
      MongoUtilities.daily_collection(collection_type, date - 7.days).drop
    end
    Rails.logger.tagged('backup') {Rails.logger.info "Mongo old collections dropped at #{Time.current}"}

    # Backup all daily stats collections + list stat data + exceptions
    # (can add author page, de_competitive collections later).
    todays_collections = collection_types.map do |collection_type|
      MongoUtilities.daily_collection(collection_type, date)
    end

    # run backup on collections
    HerokuMongoBackup::Backup.new(date).backup(todays_collections)
    Rails.logger.tagged('backup') {Rails.logger.info "Mongo Backup completed at #{Time.current}"}
  end

  # The daily collection types subject to rotation and backup.
  def collection_types
    MongoUtilities::DAILY_COLLECTION_TYPES
  end
end
-
-
1
# Restores a Mongo backup archive from an S3 file location.
class RestoreMongoBackup
  include Sidekiq::Worker
  sidekiq_options queue: :superhighmem

  # file_location - S3 URL of the backup archive; remember to make sure
  # the file is public before doing this.
  def perform(file_location)
    Rails.logger.tagged('backup') do
      Rails.logger.info "Restoring Mongo Backup #{file_location} at #{Time.current}"
    end

    HerokuMongoBackup.load_from_file(file_location)

    Rails.logger.tagged('backup') do
      Rails.logger.info "Mongo Restore completed at #{Time.current}"
    end
  end
end
-
-
1
class RestoreAndBackfillCategoryStats
-
1
include Sidekiq::Worker
-
1
sidekiq_options queue: :sync
-
1
KEY = 'category_stats_restore_stuff'
-
1
STOP_KEY = 'dude_stop_it'
-
1
ETL_COUNT = 'restore_stuff_etl_count'
-
-
1
# Resumable restore-and-backfill state machine. The current position is a
# redis marker "<date>|<stage>" under KEY; each loop iteration advances one
# stage and re-reads the marker, walking backwards one day at a time until
# 2014-07-24. Setting STOP_KEY aborts at the next iteration boundary.
def perform
  marker = $redis.get(KEY)
  return unless marker.present?

  warehouse_region_com_id = WarehouseRegion.com.id
  warehouse_region_co_uk_id = WarehouseRegion.couk.id

  date_string, stage = marker.split('|')
  date = date_string.to_date
  while date.present? && !$redis.get(STOP_KEY)
    case stage
    when 'date-start'
      # Kick off the restore for this date's archive, then wait it out.
      BackupWorkers::RestoreMongoBackup.perform_async s3_url_from_date(date_string)
      $redis.set(KEY, "#{date_string}|restore-started")
      sleep 30 while Utilities.class_in_sidekiq?(BackupWorkers::RestoreMongoBackup)
      $redis.set(KEY, "#{date_string}|restore-finished")
    when 'restore-started'
      # Resumption path: we crashed mid-restore. If the restore job is still
      # running, wait; otherwise scrap partial data and redo the date.
      if Utilities.class_in_sidekiq?(BackupWorkers::RestoreMongoBackup)
        sleep 30 while Utilities.class_in_sidekiq?(BackupWorkers::RestoreMongoBackup)
        $redis.set(KEY, "#{date_string}|restore-finished")
      else
        cleanup_restored_data
        # NOTE(review): 'date-started' is not handled by any `when` branch
        # below — the loop would spin doing nothing. Looks like it should be
        # 'date-start'; confirm before changing.
        $redis.set(KEY, "#{date_string}|date-started")
      end
    when 'restore-finished'
      warehouse_date_id = WarehouseDate.where(date: date).first_or_create.id
      # Only category stats are re-ETL'd on this backfill pass.
      # NOTE(review): the re-assignment of warehouse_date_id to itself here
      # is a no-op — presumably just bundling the flag defaults.
      warehouse_date_id, etl_product_stats, etl_book_version_categories, etl_category_stats = warehouse_date_id, false, false, true

      collection = $mongodb.collection "stats_#{date_string.gsub('-', '')}-restored"

      # Pick the boundary ids of each ETL batch (first/last of every
      # BATCH_SIZE window over ingested book versions, ordered as varchar).
      ids = WarehouseBookVersion.connection.execute("SELECT t.id FROM (SELECT id::varchar(255), row_number() OVER(ORDER BY id::varchar(255) ASC) AS row_asc FROM warehouse_book_versions where status = 'ingested') t WHERE t.row_asc % #{EtlWorkers::NightlyStatsEtl::BATCH_SIZE} = 0 OR t.row_asc % #{EtlWorkers::NightlyStatsEtl::BATCH_SIZE} = #{EtlWorkers::NightlyStatsEtl::BATCH_SIZE - 1} OR t.row_asc = 1").values.flatten
      ids += WarehouseBookVersion.connection.execute("SELECT id::varchar(255) FROM warehouse_book_versions where status = 'ingested' order by id::varchar(255) DESC limit 1").values.flatten
      params = ids.uniq.each_slice(2).collect do |slice|
        [collection.name, slice.first, slice.last, warehouse_region_com_id, warehouse_region_co_uk_id, warehouse_date_id, date_string, etl_product_stats, etl_book_version_categories, etl_category_stats]
      end.compact

      # Remember how many ETL jobs we queued so completion can be verified.
      $redis.set(ETL_COUNT, params.count)
      Sidekiq::Client.push_bulk 'class' => EtlWorkers::NightlyStatsEtl, 'args' => params if params.present?
      $redis.set(KEY, "#{date_string}|etl-queued")
      sleep 30 while Utilities.class_in_sidekiq?(EtlWorkers::NightlyStatsEtl)
      # Verify every batch produced a category-stats document; otherwise
      # drop the partial output and retry the ETL from 'restore-finished'.
      if CategoryStatsCollection.new(date).find.count == $redis.get(ETL_COUNT).to_i
        $redis.set(KEY, "#{date_string}|etl-complete")
      else
        CategoryStatsCollection.new(date).drop
        $redis.set(KEY, "#{date_string}|restore-finished")
      end
    when 'etl-queued'
      # Resumption path for a crash after the ETL jobs were queued.
      sleep 30 while Utilities.class_in_sidekiq?(EtlWorkers::NightlyStatsEtl)
      if CategoryStatsCollection.new(date).find.count == $redis.get(ETL_COUNT).to_i
        $redis.set(KEY, "#{date_string}|etl-complete")
      else
        CategoryStatsCollection.new(date).drop
        $redis.set(KEY, "#{date_string}|restore-finished")
      end
    when 'etl-complete'
      # Roll the per-batch output up into final category stats for the day.
      EtlWorkers::CategoryStatsEtl.perform_async date_string
      $redis.set(KEY, "#{date_string}|category-stats-etl-started")
      sleep 30 while Utilities.class_in_sidekiq?(EtlWorkers::CategoryStatsEtl)
      $redis.set(KEY, "#{date_string}|category-stats-etl-completed")
    when 'category-stats-etl-started'
      # Resumption path for a crash during the category-stats rollup.
      sleep 30 while Utilities.class_in_sidekiq?(EtlWorkers::CategoryStatsEtl)
      $redis.set(KEY, "#{date_string}|category-stats-etl-completed")
    when 'category-stats-etl-completed'
      cleanup_restored_data
      $redis.set(KEY, "#{date_string}|restore-cleaned")
    when 'restore-cleaned'
      # Step back one day, or stop once we pass the earliest backup date.
      # NOTE(review): after $redis.del(KEY) the marker re-read below returns
      # nil and nil.split raises — the loop exits via exception. Confirm
      # whether a clean `break` was intended.
      '2014-07-24'.to_date > date ? $redis.del(KEY) : $redis.set(KEY, "#{(date - 1.day).to_s}|date-start")
    end

    # Re-read the marker set above and continue with the next stage.
    marker = $redis.get(KEY)
    date_string, stage = marker.split('|')
    date = date_string.to_date
  end
end
-
-
1
# Maps a backup date (ISO "YYYY-MM-DD" string) to the S3 URL of that
# night's Mongo backup archive. Returns nil for dates with no known backup.
def s3_url_from_date(date_string)
  backup_urls = {
    '2014-07-24' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-24--2014-07-24%7C08%3A35%3A54.gz',
    '2014-07-25' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-25--2014-07-25%7C08%3A40%3A23.gz',
    '2014-07-26' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-26--2014-07-26%7C08%3A13%3A30.gz',
    '2014-07-27' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-27--2014-07-27%7C08%3A18%3A35.gz',
    '2014-07-28' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-28--2014-07-28%7C08%3A35%3A00.gz',
    '2014-07-29' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-29--2014-07-29%7C08%3A45%3A53.gz',
    '2014-07-30' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-30--2014-07-30%7C08%3A41%3A32.gz',
    '2014-07-31' => 'https://s3.amazonaws.com/booklr-production/backups/2014-07-31--2014-07-31%7C08%3A56%3A40.gz',
    '2014-08-01' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-01--2014-08-01%7C09%3A49%3A21.gz',
    '2014-08-02' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-02--2014-08-02%7C12%3A22%3A31.gz',
    '2014-08-03' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-03--2014-08-03%7C09%3A10%3A47.gz',
    '2014-08-04' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-04--2014-08-04%7C14%3A13%3A11.gz',
    '2014-08-05' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-05--2014-08-05%7C16%3A29%3A29.gz',
    '2014-08-06' => 'https://s3.amazonaws.com/booklr-production/backups/2014-08-06--2014-08-06%7C14%3A49%3A56.gz'
  }
  backup_urls[date_string]
end
-
-
1
# Drops every Mongo collection whose name ends in '-restored', i.e. the
# temporary collections created by a backup restore pass.
def cleanup_restored_data
  $mongodb.collection_names.select {|x| x.ends_with? '-restored'}.each do |name|
    $mongodb.collection(name).drop
  end
end
-
end
-
end
-
1
module BigDataReports
-
1
# Expands one ReportBatch into individual report documents: records that
# cannot be processed get an :unable_to_process report with a reason, valid
# ones get a :processing report and a worker job pushed in bulk.
class QueueReportBatch
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # report_id - BSON id string of the ReportBatch to expand.
  def perform(report_id)
    batch_document = ReportBatch.find(report_id)
    batch_id = BSON::ObjectId.from_string(report_id)
    batch_parameters = batch_document.batch_params

    # Ingested .com versions that actually have stats for the batch date.
    isbn_warehouse_book_versions = WarehouseBookVersion.ingested.com.where(isbn13: batch_parameters['isbn13s']).joins(:warehouse_stats).where{warehouse_stats.warehouse_date_id == batch_parameters['warehouse_date_id']}
    asin_warehouse_book_versions = WarehouseBookVersion.ingested.com.where(asin: batch_parameters['asins']).joins(:warehouse_stats).where{warehouse_stats.warehouse_date_id == batch_parameters['warehouse_date_id']}

    # Requested identifiers that did not resolve to a usable version above.
    invalid_isbn13s = batch_parameters['isbn13s'] - isbn_warehouse_book_versions.collect(&:isbn13)
    invalid_asins = batch_parameters['asins'] - asin_warehouse_book_versions.collect(&:asin)
    invalid_book_versions = WarehouseBookVersion.com.where{(asin.in invalid_asins) | (isbn13.in invalid_isbn13s)}
    # NOTE(review): the `else []` fallback means an unknown job_type raises
    # NoMethodError at the first `klass.create` — confirm whether an explicit
    # raise with a clear message was intended.
    klass = case batch_document.job_type
    when 'BigDataReports::Top100ProjectedRankReportWorker'
      Top100ProjectedRankReport
    when 'BigDataReports::SimilarBookPricingReportWorker'
      SimilarBookPricingReport
    else
      []
    end
    # Identifiers we have never seen at all -> unable_to_process reports.
    (invalid_asins - invalid_book_versions.collect(&:asin)).each do |missing_asin|
      klass.create job_type: batch_document.job_type,
                   asin: missing_asin,
                   warehouse_date_id: batch_parameters['warehouse_date_id'],
                   batch_ids: [batch_id],
                   status: :unable_to_process,
                   error: "ASIN #{missing_asin} is not currently in our system"
    end
    (invalid_isbn13s - invalid_book_versions.collect(&:isbn13)).each do |missing_isbn13|
      klass.create job_type: batch_document.job_type,
                   isbn13: missing_isbn13,
                   warehouse_date_id: batch_parameters['warehouse_date_id'],
                   batch_ids: [batch_id],
                   status: :unable_to_process,
                   error: "ISBN13 #{missing_isbn13} is not currently in our system"
    end
    # Versions we know about but cannot report on -> explain why.
    invalid_book_versions.each do |invalid_book_version|
      error_msg = case
      when !invalid_book_version.ingested?
        "Book version is in invalid state (#{invalid_book_version.status})"
      when invalid_book_version.warehouse_stats.blank?
        'Book version is valid but has no stats, it was most likely ingested today. If this problem persists tomorrow, contact your friendly neighborhood support.'
      when invalid_book_version.warehouse_stats.where{warehouse_stats.warehouse_date_id == batch_parameters['warehouse_date_id']}.blank?
        'Book version is valid but has no data for today, let your big data helpers know so they can investigate'
      end
      klass.create job_type: batch_document.job_type,
                   asin: invalid_book_version.asin,
                   isbn13: invalid_book_version.isbn13,
                   warehouse_date_id: batch_parameters['warehouse_date_id'],
                   batch_ids: [batch_id],
                   status: :unable_to_process,
                   error: error_msg
    end

    # Valid versions -> one :processing report each, then one worker job
    # per report, pushed in a single bulk call.
    report_ids = []
    all_book_versions = isbn_warehouse_book_versions + asin_warehouse_book_versions
    all_book_versions.each do |book_version|
      report = klass.create job_type: batch_document.job_type,
                            asin: book_version.asin,
                            isbn13: book_version.isbn13,
                            warehouse_date_id: batch_parameters['warehouse_date_id'],
                            batch_ids: [batch_id],
                            status: :processing
      report_ids << report.id
    end
    # Each job receives its report id wrapped in an args array.
    job_params = report_ids.collect(&method(:Array))
    Report.start_batch(report_ids, batch_id)
    Sidekiq::Client.push_bulk('class' => batch_document.job_type.constantize, 'args' => job_params)

    # The batch document is consumed once fully expanded.
    batch_document.destroy
  end
end
-
-
1
# Generates one previously-created Top100ProjectedRankReport document.
class Top100ProjectedRankReportWorker
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # report_id - id of the Top100ProjectedRankReport to generate.
  def perform(report_id)
    Top100ProjectedRankReport.find(report_id).generate
  end
end
-
-
1
# Creates and generates the top-100 price distribution report for a date.
class Top100PriceDistributionReportWorker
  include Sidekiq::Worker
  include ReportUtilities
  sidekiq_options queue: :reporting

  # date_string - the report date as an ISO date string.
  def perform(date_string)
    date_id = WarehouseDate.find_by(date: date_string.to_date).id

    Top100PriceDistributionReport.create(
      warehouse_date_id: date_id,
      status: :processing
    ).generate
  end
end
-
-
1
# Generates one previously-created SimilarBookPricingReport document.
class SimilarBookPricingReportWorker
  include Sidekiq::Worker
  include ReportUtilities
  sidekiq_options queue: :reporting

  # report_id - id of the SimilarBookPricingReport to generate.
  def perform(report_id)
    SimilarBookPricingReport.find(report_id).generate
  end
end
-
-
1
class NewIdentification
-
1
include Sidekiq::Worker
-
1
include ReportUtilities
-
1
sidekiq_options queue: :reporting
-
-
1
# Builds the "new identification" CSV report: finds book versions matching
# the rating/review/page-count criteria that are projected (but not yet
# ranked) into a top-100 list, computes per-book trend statistics over the
# trailing 90 days, writes them as CSV rows and ships the file to S3.
def perform(report_date_string, amazon_average_rating, amazon_review_count, days_of_data_min, min_page_count, client_name, category_names)
  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{client_name.to_s.camelcase} Identification Report -----"}

  # NOTE(review): the client_name parameter is unconditionally overwritten
  # here, so every report is produced as :booklr — confirm intentional.
  client_name = :booklr
  # client_config is only referenced by the commented-out mailer below.
  client_config = AmazeBot.config[:reports][:clients][client_name]
  report_date = report_date_string.to_date

  # Books must have existed at least days_of_data_min days before the report.
  minimum_creation_date = (report_date - days_of_data_min.days).in_time_zone.to_s
  # Trailing 90-day window (inclusive of the report date).
  date_range = (report_date - 89.days)..report_date
  warehouse_dates = WarehouseDate.where(date: date_range).order(date: :asc)
  current_warehouse_date = warehouse_dates.last
  warehouse_date_ids = warehouse_dates.collect(&:id)

  warehouse_region_id = WarehouseRegion.com.id
  warehouse_categories = WarehouseCategory.amazon.com

  book_versions = identify_book_versions current_warehouse_date.id,
                                         category_names: category_names,
                                         minimum_creation_date: minimum_creation_date,
                                         minimum_amazon_average_rating: amazon_average_rating,
                                         minimum_amazon_review_count: amazon_review_count,
                                         min_page_count: min_page_count
  stats = current_warehouse_date.warehouse_stats.where(warehouse_book_version_id: book_versions.collect(&:id).uniq)
  rank_projections_by_book_version_id = WarehouseStat.top100_rank_projections current_warehouse_date, stats, false
  # Keep only books projected into a top-100 position they do not already hold.
  filtered_book_versions = book_versions.select {|book_version| rank_projections_by_book_version_id[book_version.id].any? {|_, projection_details| !projection_details[:currently_ranked] && projection_details[:position] <= 100}}
  filtered_book_version_ids = filtered_book_versions.collect(&:id)

  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{client_name.to_s.camelcase} Identification Complete, report generation starting -----"}

  report_hash = EnterpriseReports.generate_report_hash("#{client_name}-new-identification-report-#{report_date.strftime('%m%d%y')}", client_name)
  report_csv = EnterpriseReports.open_csv(report_hash)

  # Append header row
  header = ['Title', 'Author', 'ASIN', 'ISBN', 'Pub Date', 'Publisher', 'Sold By', 'Page Count', 'Days of Data',
            'Created Date', 'Total Days Since Creation', '90 Day Average Overall Rank', '30 Day Moving Average',
            '7 Day Moving Average', 'Trendline Growth %', 'R-Squared', '90 Day Overall Rank Growth Rate',
            '90 Day Overall Rank Volatility', 'Apple Number of Ratings', 'BN number of Ratings', 'Amazon Number of Likes',
            'Amazon Number of Ratings', 'Ratings Per Day Since Published', 'Reviews Per Day Over Last 90 Days',
            'Average Star Rating', '% of Ratings 4 or above', '#1 Similar Category', '#2 Similar Category', '#3 Similar Category',
            '#1 Sub Category', '#1 Sub Category Percentage', '#2 Sub Category', '#2 Sub Category Percentage', '#3 Sub Category',
            '#3 Sub Category Percentage', 'Current Sales Rank', 'Current Price', 'Product URL', 'Amazon Description',
            'Projected Top 100 Category Name and Rank']
  report_csv << header

  # Process books in blocks of 20 to bound the per-query stat volume.
  block_size = 20
  (filtered_book_version_ids.count / block_size + 1).times do |block_count|
    Rails.logger.tagged('enterprise') {Rails.logger.info "----- Generating #{client_name.to_s.camelcase} report for #{(block_size * block_count)..(block_size * (block_count + 1) - 1)} out of #{filtered_book_version_ids.count} book versions at #{Time.current}-----"}

    # Pull this block's 90 days of stats, ordered so `chunk` groups all of
    # one book's stats together in date order.
    WarehouseStat.select(WarehouseStat::WAREHOUSE_STAT_FIELDS + WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS).
      where(warehouse_book_version_id: filtered_book_version_ids[(block_size * block_count)..(block_size * (block_count + 1) - 1)]).
      where(warehouse_date_id: warehouse_date_ids, warehouse_region_id: warehouse_region_id).
      order(:warehouse_book_version_id, :warehouse_date_id).
      chunk{|el| el.warehouse_book_version_id}.each do |warehouse_book_version_id, warehouse_stats|

      book_version = filtered_book_versions.select {|warehouse_book_version| warehouse_book_version.id == warehouse_book_version_id}.first
      # Skip books that cannot support the trend computations below.
      if warehouse_stats.collect(&:amazon_sales_rank).compact.blank?
        Rails.logger.tagged('enterprise') {Rails.logger.info "skipping warehouse_book_version #{book_version.id}, no sales rank data"}
        next
      elsif warehouse_stats.count < days_of_data_min
        Rails.logger.tagged('enterprise') {Rails.logger.info "skipping warehouse_book_version #{book_version.id}, less than #{days_of_data_min} days of data"}
        next
      end

      oldest_stat, newest_stat = warehouse_stats.first, warehouse_stats.last

      first_stat_date = warehouse_dates.select{|warehouse_date| warehouse_date.id == oldest_stat.warehouse_date_id}.first.date
      last_stat_date = warehouse_dates.select{|warehouse_date| warehouse_date.id == newest_stat.warehouse_date_id}.first.date
      # Identity / metadata columns.
      row = [book_version.title, book_version.author_name, book_version.asin, book_version.isbn13, book_version.pub_date,
             book_version.publisher, book_version.sold_by, book_version.pages, warehouse_stats.count, book_version.created_at.to_date.to_s,
             (Date.current - book_version.created_at.to_date).to_i]

      # Regression Setup
      amazon_sales_ranks = warehouse_stats.collect(&:amazon_sales_rank).compact
      line_fit = LineFit.new
      valid = line_fit.setData((1..amazon_sales_ranks.count).to_a, amazon_sales_ranks)

      # Rank averages, trendline growth %, fit quality and growth rate.
      row += [amazon_sales_ranks.mean, amazon_sales_ranks.moving_average(30).last, amazon_sales_ranks.moving_average(7).last]
      row << (valid ? "#{((line_fit.forecast(1) - line_fit.forecast(amazon_sales_ranks.count)) / (line_fit.forecast(1).abs) * 100).round(2)}%" : nil)
      row << (valid ? line_fit.rSquared.round(3) : nil)
      row << Formulas.average_growth_rate(oldest_stat.amazon_sales_rank, newest_stat.amazon_sales_rank, last_stat_date - first_stat_date, :negative)

      # Volatility and per-store engagement counts.
      row += [amazon_sales_ranks.standard_deviation, newest_stat.itunes_rating_count, newest_stat.bn_review_count,
              newest_stat.amazon_likes, newest_stat.amazon_review_count]
      row << (newest_stat.amazon_review_count.present? && book_version.pub_date.present? ? newest_stat.amazon_review_count / (report_date - book_version.pub_date.to_date).to_f : nil)
      row << (oldest_stat.amazon_review_count.present? && newest_stat.amazon_review_count.present? ? (newest_stat.amazon_review_count - oldest_stat.amazon_review_count) / warehouse_stats.count.to_f : nil)

      # Star-rating quality: share of 4/5-star reviews out of all reviews.
      row << newest_stat.amazon_average_rating
      if newest_stat.amazon_review_count.present? && (newest_stat.five_star_count.present? || newest_stat.four_star_count.present?)
        top_count = (newest_stat.five_star_count || 0) + (newest_stat.four_star_count || 0)
        row << ((top_count.to_f / newest_stat.amazon_review_count) * 100).to_s + '%'
      else
        row << '0%'
      end

      # Top Similar Item Categories
      top_similar_item_categories = warehouse_stats.collect {|warehouse_stat| warehouse_stat.amazon_similar_item_category_names}.flatten.mode.reverse.first(3)
      row += EnterpriseReports.pad_serialized_data(top_similar_item_categories, 3) {|value| value}

      # Top Sub Categories
      top_subcategory_id_frequencies = warehouse_stats.collect {|warehouse_stat| (1..3).collect {|num| warehouse_stat.send "warehouse_amazon_category#{num}_id"}}.flatten.compact.frequencies.to_a.reverse.first(3)
      category_names_and_percentages = top_subcategory_id_frequencies.map {|warehouse_category_id, count| [warehouse_categories.find {|category| category.id == warehouse_category_id}.name, "#{count.to_f / warehouse_stats.count * 100}%"]}
      row += EnterpriseReports.pad_serialized_data(category_names_and_percentages, 6) {|values| values.flatten}

      # Current snapshot columns (price is stored in cents).
      row += [newest_stat.amazon_sales_rank, (newest_stat.amazon_price / 100.0 if newest_stat.amazon_price.present?), Urls.amazon_book_page(book_version.asin, '.com'), book_version.amazon_book_description]

      # Top 100 Projections
      rankable_projection_details = rank_projections_by_book_version_id[warehouse_book_version_id].select do |_, projection_details|
        !projection_details[:currently_ranked] && projection_details[:position] <= 100
      end

      rankable_projection_details.each do |_, projection_details|
        row << "#{projection_details[:name]} -- #{projection_details[:position]}"
      end

      report_csv << row
    end
  end

  report_csv.flush

  # Pass reports array to mailer and deliver
  EnterpriseReports.move_to_s3(client_name, report_csv)
  # EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports]["#{client_name}_identification"]).deliver

  report_csv.close

  Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{client_name.to_s.camelcase} Identification Report Delivered -----"}
end
-
-
1
# Selects candidate book versions for the identification report.
#
# warehouse_date_id - dimension id the book must have stats for
# options: :category_names, :minimum_creation_date,
#          :minimum_amazon_average_rating, :minimum_amazon_review_count,
#          :min_page_count — each filter is applied only when present.
#
# Returns an Array of WarehouseBookVersion records with excluded
# publishers/sellers (and, optionally, short books) removed.
def identify_book_versions(warehouse_date_id, options = {})
  category_names = options.delete :category_names
  minimum_creation_date = options.delete :minimum_creation_date
  minimum_amazon_average_rating = options.delete :minimum_amazon_average_rating
  minimum_amazon_review_count = options.delete :minimum_amazon_review_count
  min_page_count = options.delete :min_page_count
  relation = WarehouseBookVersion.joins(:warehouse_stats).where{warehouse_stats.warehouse_date_id == warehouse_date_id}
  if category_names.present?
    # Restrict to books belonging to any of the requested categories.
    book_version_categories = BookVersionCategory.where(category_name: category_names)
    relation = relation.where(id: book_version_categories.collect(&:warehouse_book_version_id)) if book_version_categories.present?
  end
  relation = relation.where{created_at < minimum_creation_date} if minimum_creation_date.present?
  relation = relation.where{warehouse_stats.amazon_average_rating > minimum_amazon_average_rating} if minimum_amazon_average_rating.present?
  relation = relation.where{warehouse_stats.amazon_review_count > minimum_amazon_review_count} if minimum_amazon_review_count.present?

  # In-memory exclusion filters (materializes the relation into an Array).
  warehouse_book_versions = relation.reject {|warehouse_book_version| EnterpriseReports::StatisticalReports::EXCLUDED_PUBLISHERS.include? warehouse_book_version.publisher}
  warehouse_book_versions = warehouse_book_versions.reject {|warehouse_book_version| EnterpriseReports::StatisticalReports::EXCLUDED_SOLD_BY.include? warehouse_book_version.sold_by}
  warehouse_book_versions = warehouse_book_versions.reject {|warehouse_book_version| warehouse_book_version.pages.to_i < min_page_count} if min_page_count

  warehouse_book_versions
end
-
end
-
end
-
1
module BookVersionValidationWorkers
-
1
# Fans all ingested book versions out into contiguous id blocks, queuing
# one QueueDiscoverAmazon404s job per block.
class QueueDiscoverAmazon404sBlocks
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # block_size - number of book versions covered by each queued block.
  def perform(block_size = RedisUtilities::BLOCK_SIZE)
    all_ids = WarehouseBookVersion.ingested.order(:id).value_of(:id)
    id_ranges = all_ids.each_slice(block_size).map do |id_slice|
      [id_slice.first, id_slice.last]
    end
    Sidekiq::Client.push_bulk('class' => BookVersionValidationWorkers::QueueDiscoverAmazon404s, 'args' => id_ranges)
  end
end
-
-
1
# Queues one DiscoverAmazon404s check per ingested book version in the
# given id range.
class QueueDiscoverAmazon404s
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # starting/ending_book_version_id - inclusive id bounds for the block.
  def perform(starting_book_version_id, ending_book_version_id)
    id_range = starting_book_version_id..ending_book_version_id
    args = WarehouseBookVersion.ingested.where(id: id_range).value_of(:id, :asin, :tld).map {|tuple| Array(tuple)}
    Sidekiq::Client.push_bulk('class' => MongoBookVersionExceptionWorkers::DiscoverAmazon404s, 'args' => args)
  end
end
-
-
1
# Splits every tracked book version (for users opted into validation) into
# id blocks and queues one QueueValidations job per block.
class QueueValidationBlocks
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # block_size - number of book versions per queued validation block.
  def perform(block_size = RedisUtilities::BLOCK_SIZE)
    user_ids = User.where(validate_tracked_book_versions: true).value_of :id
    book_version_ids = TrackedBookVersion.where(user_id: user_ids).order(:warehouse_book_version_id).uniq.value_of(:warehouse_book_version_id)
    Rails.logger.tagged('enterprise') {Rails.logger.info "--- Validations running on #{user_ids.count} users for #{book_version_ids.count} tracked isbns---"}
    params = book_version_ids.each_slice(block_size).map do |id_slice|
      [id_slice.first, id_slice.last, user_ids]
    end
    Sidekiq::Client.push_bulk('class' => BookVersionValidationWorkers::QueueValidations, 'args' => params)
  end
end
-
-
1
# For each tracked book version in an id block, assembles Amazon-search and
# Barnes & Noble validation jobs and pushes them in bulk.
class QueueValidations
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # starting/ending_book_version_id - inclusive id bounds for this block
  # user_ids - users whose tracked versions participate in validation
  def perform(starting_book_version_id, ending_book_version_id, user_ids)
    Rails.logger.tagged('enterprise') {Rails.logger.info "Validations running on the id block: #{starting_book_version_id} - #{ending_book_version_id}"}
    collection_name = MongoUtilities.daily_collection_name(:book_version_exceptions)
    # Raw SQL joining each tracked version to its book version; rows are
    # ordered so `chunk` below groups all trackings of one book together.
    sql = TrackedBookVersion.select([:warehouse_book_version_id, :metadata]).
      join_select(:inner, false, warehouse_book_version: [:asin, :tld, :status]).
      select('COALESCE(warehouse_book_versions.isbn13, warehouse_book_versions.asin) as isbn_or_asin').
      joins(:warehouse_book_version).
      where(user_id: user_ids, warehouse_book_version_id: starting_book_version_id..ending_book_version_id).
      order(:warehouse_book_version_id).to_sql
    amazon_params = TrackedBookVersion.connection.execute(sql).chunk {|res| res['warehouse_book_version_id']}.collect do |_, rows|
      base_row = rows.first
      # Combine the url hints from every user tracking this book.
      # NOTE(review): YAML.load on stored metadata — fine if metadata is
      # app-written only; confirm it never contains user-supplied YAML.
      metadatas = rows.collect do |row|
        row['metadata'].present? ? YAML.load(row['metadata']) : nil
      end
      url_hints = Utilities.get_url_hints_from_metadata metadatas

      [collection_name, base_row['warehouse_book_version_id'], base_row['isbn_or_asin'], base_row['warehouse_book_version_asin'],
       base_row['warehouse_book_version_tld'], base_row['warehouse_book_version_status'], url_hints]
    end
    Sidekiq::Client.push_bulk('class' => MongoBookVersionExceptionWorkers::ValidateAmazonSearch, 'args' => amazon_params)
    # B&N validation only needs collection, book version id and isbn/asin.
    bn_params = amazon_params.collect {|params| params.first(3)}
    Sidekiq::Client.push_bulk('class' => MongoBookVersionExceptionWorkers::ValidateBarnesAndNoble, 'args' => bn_params)
  end
end
-
-
1
# Splits ingested tracked book versions (for users opted into iTunes
# validation) into id blocks and queues one QueueItunesValidations per block.
class QueueItunesValidationBlocks
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # block_size - number of book versions per queued validation block.
  def perform(block_size = RedisUtilities::BLOCK_SIZE)
    user_ids = User.where(validate_tracked_book_versions_on_itunes: true).value_of :id
    book_version_ids = TrackedBookVersion.where(user_id: user_ids).joins(:warehouse_book_version).where{warehouse_book_version.status == 'ingested'}.order(:warehouse_book_version_id).uniq.value_of(:warehouse_book_version_id)
    params = book_version_ids.each_slice(block_size).map do |id_slice|
      [id_slice.first, id_slice.last, user_ids]
    end
    Sidekiq::Client.push_bulk('class' => BookVersionValidationWorkers::QueueItunesValidations, 'args' => params)
  end
end
-
-
1
# Queues iTunes validation jobs for an id block of tracked book versions.
class QueueItunesValidations
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # iTunes validations only apply to titles that are ingested, are Kindle
  # ebooks and carry a real ISBN (originally: "part of rhincactive list" —
  # NOTE(review): garbled in source, presumably a client list; confirm).
  def perform(starting_book_version_id, ending_book_version_id, user_ids)
    collection_name = MongoUtilities.daily_collection_name(:book_version_exceptions)
    # Join each tracked version to format/status so eligibility can be
    # decided from the result rows alone; ordered for chunking below.
    sql = TrackedBookVersion.select(:warehouse_book_version_id).
      join_select(:inner, false, warehouse_book_version: [:book_format, :status]).
      select('COALESCE(warehouse_book_versions.isbn13, warehouse_book_versions.asin) as isbn_or_asin').
      joins(:warehouse_book_version).
      where(user_id: user_ids, warehouse_book_version_id: starting_book_version_id..ending_book_version_id).
      order(:warehouse_book_version_id).to_sql
    params = TrackedBookVersion.connection.execute(sql).chunk {|res| res['warehouse_book_version_id']}.collect do |_, rows|
      base_row = rows.first
      # Only ingested Kindle editions with an actual ISBN are validated;
      # ineligible books yield nil and are dropped by the compact below.
      if base_row['warehouse_book_version_status'] == 'ingested' && base_row['warehouse_book_version_book_format'].include?('Kindle Edition') && Utilities.is_isbn?(base_row['isbn_or_asin'])
        [collection_name, base_row['warehouse_book_version_id'], base_row['isbn_or_asin']]
      end
    end.compact
    Sidekiq::Client.push_bulk('class' => MongoBookVersionExceptionWorkers::ValidateItunes, 'args' => params)
  end
end
-
-
1
# Validates a freshly-created book version against Amazon search results,
# moving it from :new to :validated or :invalid_on_amazon.
class ValidateNewBookVersion
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # book_version_id - id of the WarehouseBookVersion to validate.
  def perform(book_version_id)
    book_version = WarehouseBookVersion.find book_version_id
    # Only :new versions are eligible; anything else was already decided.
    return unless book_version.status == :new

    if book_version.isbn_or_asin.blank?
      # Nothing to search for — cannot exist on Amazon.
      book_version.update_attributes status: :invalid_on_amazon
    else
      ProxyUtilities.proxy_setup :amazon
      search_page = AmazonSearchPage.by_isbn_or_asin_and_tld book_version.isbn_or_asin, book_version.tld
      # handle_captcha is defined outside this view — NOTE(review): appears
      # to return falsy when a captcha blocks the scrape, in which case the
      # status is left at :new for a retry; confirm.
      if handle_captcha(search_page, 60, book_version_id)
        # Valid iff the search found it and the result is unambiguous given
        # the url hints collected from all users tracking this book.
        valid = !Validations.amazon_not_found_in_search?(search_page) && !Validations.amazon_ambiguous_search_results?(search_page, Utilities.get_url_hints_from_metadata(book_version.tracked_book_versions.collect(&:metadata)))

        book_version.update_attributes status: (valid ? :validated : :invalid_on_amazon)
      end
    end
  end
end
-
end
-
1
module BookVersionWorkers
-
1
# Creates a WarehouseBookVersion from an attribute hash unless a record
# already exists with any of the same identifiers on the same tld.
class Create
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # params - attribute hash for the new WarehouseBookVersion; the keys
  # isbn13/asin/bn_id/itunes_id/tld also drive the duplicate check.
  def perform(params)
    params = params.with_indifferent_access
    # Only identifying keys participate in the existence check.
    existence_params = params.dup.keep_if {|key, _| %w[isbn13 asin bn_id itunes_id tld].include? key.to_s}
    tld = existence_params.delete :tld
    arel_table = WarehouseBookVersion.arel_table
    # Check for WarehouseBookVersion where (asin = x AND tld = t) OR (isbn = y AND tld = t) OR (bn_id = z AND tld = t)
    conditions = existence_params.each_pair.collect {|key, value| arel_table[key].eq(value).and(arel_table[:tld].eq(tld))}
    # NOTE(review): if params carries no identifying keys, `reduce` yields
    # nil and `where(nil)` is unscoped — the create is then skipped whenever
    # any book version exists at all. Confirm callers always pass at least
    # one identifier.
    WarehouseBookVersion.create! params unless WarehouseBookVersion.where(conditions.reduce {|final_condition, condition| final_condition.or(condition)}).exists?
  end
end
-
-
1
# Applies an attribute hash (minus :id) to an existing book version.
class Update
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # params - attribute hash; :id selects the record, the rest is applied.
  def perform(params)
    attributes = params.with_indifferent_access
    book_version = WarehouseBookVersion.find(attributes[:id])
    book_version.update_attributes(attributes.except(:id))
  end
end
-
-
1
# Runs ingestion for one book version outside the AR query cache; versions
# deleted before the job runs are logged and skipped.
class Ingest
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # warehouse_book_version_id - id of the version to ingest.
  def perform(warehouse_book_version_id)
    ActiveRecord::Base.connection.uncached do
      begin
        WarehouseBookVersion.find(warehouse_book_version_id).ingest
      rescue ActiveRecord::RecordNotFound
        Rails.logger.tagged('book_data') {Rails.logger.info "Tried to ingest a book that no longer exists"}
      end
    end
  end
end
-
-
1
# Batches .com book versions validated from the top-100 lists into Amazon
# API lookup jobs of GetManyApiResponses::BATCH_SIZE each (at most 5 full
# batches per run).
class QueueValidatedTop100ApiCall
  include Sidekiq::Worker
  sidekiq_options queue: :ingest_high

  # wait_till_full - when non-zero, a trailing partial batch is not queued
  # and is left for a later run once it fills up.
  def perform(wait_till_full = 0)
    # Squeel block condition: status == 'validated_from_top_100s'.
    WarehouseBookVersion.com.where{status == 'validated_from_top_100s'}.limit(AmazonApiWorkers::GetManyApiResponses::BATCH_SIZE * 5).each_slice(AmazonApiWorkers::GetManyApiResponses::BATCH_SIZE) do |warehouse_book_versions_slice|
      AmazonApiWorkers::GetManyApiResponses.perform_async warehouse_book_versions_slice.collect(&:id), :asin, '.com' if wait_till_full == 0 || warehouse_book_versions_slice.count == AmazonApiWorkers::GetManyApiResponses::BATCH_SIZE
    end
  end
end
-
-
1
# Watches a scrape-queuing run: waits for the queuing jobs to finish,
# verifies the staged job count matches the expected count recorded in
# redis, then promotes the staging queue to the live scraping queue.
# Any mismatch short-circuits with an alert email instead.
class MonitorScrapeJobCount
  include Sidekiq::Worker
  sidekiq_options queue: :high

  # scrape_count_key    - redis key holding the expected number of jobs
  # worker_class_string - scraper worker class name; its sidekiq queue is
  #                       the target of the staging-queue promotion
  def perform(scrape_count_key, worker_class_string)
    return NotificationMailer.scraper_count_error('Scrape count was not properly updated in redis for comparison').deliver if RedisUtilities.get_count(scrape_count_key).blank? || RedisUtilities.get_count(scrape_count_key) == 0

    # Wait for all queuing jobs to drain, bailing out if they keep failing.
    scraping_jobs = [BookVersionWorkers::QueueNightlyScrape, BookVersionWorkers::QueueNightlyScrapeBlock,
                     BookVersionWorkers::QueueAmazonAuthorPageScrape, BookVersionWorkers::QueueAmazonAuthorPageScrapeBlock]
    while Utilities.class_in_sidekiq?(scraping_jobs)
      return NotificationMailer.scraper_count_error('Scrape queuing jobs are repeatedly failing').deliver if Utilities.job_class_in_retry_set?(scraping_jobs, 4)

      sleep(10)
    end

    # Compare expected count against what actually landed on staging.
    scraping_queue = Sidekiq::Queue.new worker_class_string.constantize.sidekiq_options_hash['queue']
    staging_queue = Sidekiq::Queue.new Sidekiq::Client.convert_to_staging_queue(scraping_queue.name)
    return NotificationMailer.scraper_count_error("Queuing failed, number of queued jobs (#{staging_queue.size}) does not match the scrape count in redis (#{RedisUtilities.get_count scrape_count_key})").deliver if RedisUtilities.get_count(scrape_count_key) != staging_queue.size

    # Atomically promote staging -> live by renaming the underlying redis
    # list and registering the live queue name with Sidekiq.
    Sidekiq.redis do |conn|
      conn.rename("queue:#{staging_queue.name}", "queue:#{scraping_queue.name}")
      conn.sadd 'queues', scraping_queue.name
    end

    NotificationMailer.scraper_count_error('Queue rename failed, no jobs on the scraping queue').deliver unless scraping_queue.size > 0
  end
end
-
-
1
class QueueNightlyScrape
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Kicks off the nightly stats scrape for one scope: prepares today's mongo
  # stats collection, records the expected job count in redis, and fans the
  # id list out as QueueNightlyScrapeBlock jobs of BLOCK_SIZE ids each.
  def perform(scope)
    Rails.logger.tagged("book_data_#{scope}") {Rails.logger.info "Stat run started at: #{Time.current}"}

    # Create today's mongo stats collection, then apply usePowerOf2Sizes to
    # improve storage/performance.
    $mongodb.command({'create' => MongoUtilities.daily_collection_name(:stats)})
    $mongodb.command({'collMod' => MongoUtilities.daily_collection_name(:stats), 'usePowerOf2Sizes' => true})

    ids = WarehouseBookVersion.send(scope).order(:id).value_of(:id)

    scrape_count_key = RedisUtilities.get_scrape_count_key(scope)
    RedisUtilities.set_count scrape_count_key, ids.count

    # One block job per BLOCK_SIZE-sized slice, addressed by its first id.
    block_args = ids.each_slice(RedisUtilities::BLOCK_SIZE).collect do |slice|
      [scope, slice.first, RedisUtilities::BLOCK_SIZE]
    end

    Sidekiq::Client.push_bulk('class' => BookVersionWorkers::QueueNightlyScrapeBlock, 'args' => block_args)
    BookVersionWorkers::MonitorScrapeJobCount.perform_async scrape_count_key, WarehouseBookVersion.scope_to_worker_class(scope).to_s

    Rails.logger.tagged("book_data_#{scope}") {Rails.logger.info "Results: #{ids.count} book versions queued into #{(ids.count / RedisUtilities::BLOCK_SIZE.to_f).ceil} collections."}
  end
end
-
-
1
class QueueNightlyScrapeBlock
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Stages the actual scrape jobs for one block of book versions starting at
  # starting_id, pushing raw SQL result rows as job args onto the staging
  # queue of the scope's worker class.
  def perform(scope, starting_id, block_size)
    Rails.logger.tagged("book_data_#{scope}") {Rails.logger.info "Queuing block of #{RedisUtilities::BLOCK_SIZE} starting from #{starting_id} at: #{Time.current}"}

    collection_name = MongoUtilities.daily_collection_name(:stats)
    projection = "'#{collection_name}' AS collection_name, asin || '-' || tld AS mongo_id, id, asin, COALESCE(isbn13, bn_id) AS ean, tld, itunes_id, book_format"
    # B&N and Goodreads scrapes additionally need their canonical URLs.
    case scope.to_s
    when 'bn_statable'        then projection += ', canonical_bn_url'
    when 'goodreads_statable' then projection += ', canonical_goodreads_url'
    end

    sql = WarehouseBookVersion.send(scope).select(projection).order(:id).where{id >= starting_id}.limit(block_size).to_sql
    rows = WarehouseBookVersion.connection.execute(sql)

    Sidekiq::Client.push_bulk_staged('class' => WarehouseBookVersion.scope_to_worker_class(scope), 'args' => rows.values) if rows.values.present?

    Rails.logger.tagged("book_data_#{scope}") {Rails.logger.info "Queuing block of #{RedisUtilities::BLOCK_SIZE} starting from #{starting_id} completed at: #{Time.current}"}
  end
end
-
-
1
class QueueAmazonAuthorPageScrape
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Schedules the author-page stat scrape: counts the distinct
  # (tld, author_asin) pairs, records that count in redis, and queues one
  # QueueAmazonAuthorPageScrapeBlock job per BLOCK_SIZE-sized block, then
  # starts the monitor that promotes the staging queue.
  def perform
    sql = WarehouseBookVersion.select('distinct on (tld, author_asin) author_asin').where.not(author_asin: nil).group(:tld, :author_asin).to_sql
    result = WarehouseBookVersion.connection.execute(sql)
    if result.count > 0
      scrape_count_key = RedisUtilities.get_scrape_count_key(:amazon_author_page)
      RedisUtilities.set_count scrape_count_key, result.count
      # Fixed off-by-one: the previous `(0..(count / size).floor)` inclusive
      # range queued an extra, empty block job whenever count was an exact
      # multiple of BLOCK_SIZE; an exclusive range over ceil(count / size)
      # yields exactly one job per non-empty block.
      values = (0...(result.count / RedisUtilities::BLOCK_SIZE.to_f).ceil).collect {|block_number| [block_number, RedisUtilities::BLOCK_SIZE]}

      Sidekiq::Client.push_bulk('class' => BookVersionWorkers::QueueAmazonAuthorPageScrapeBlock, 'args' => values)
      BookVersionWorkers::MonitorScrapeJobCount.perform_async scrape_count_key, MongoWorkers::GetAmazonAuthorPageStats.to_s
    end
  end
end
-
-
1
class QueueAmazonAuthorPageScrapeBlock
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Stages author-page stat scrapes for one block of distinct
  # (tld, author_asin) pairs, addressed by block_number * block_size offset.
  def perform(block_number, block_size)
    select_statement = "distinct on (tld, author_asin) '#{MongoUtilities.daily_collection_name(:stats)}' AS collection_name, array_agg(asin || '-' || tld) as mongo_ids, tld, author_asin"
    sql = WarehouseBookVersion.select(select_statement).where.not(author_asin: nil).group(:tld, :author_asin).order(:tld, :author_asin).offset(block_number * block_size).limit(block_size).to_sql
    rows = WarehouseBookVersion.connection.execute(sql)

    # Postgres returns array_agg as a literal like "{a-1,b-2}"; strip the
    # surrounding braces and split on commas to recover the mongo ids.
    job_args = rows.values.collect do |row|
      collection_name, mongo_ids, tld, author_asin = row
      [collection_name, mongo_ids[1..-2].split(','), tld, author_asin]
    end

    Sidekiq::Client.push_bulk_staged('class' => MongoWorkers::GetAmazonAuthorPageStats, 'args' => job_args) if job_args.present?
  end
end
-
-
1
class DownloadImage
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # Attaches the image at +url+ to the given book version, but only after a
  # probe request confirms the URL answers with HTTP 200.
  def perform(warehouse_book_version_id, url)
    ActiveRecord::Base.connection.uncached do
      book_version = WarehouseBookVersion.find warehouse_book_version_id
      next unless url.present? && Net::HTTP.get_response(URI.parse(URI.encode(url))).code == "200"

      begin
        book_version.remote_book_version_image_url = url
        book_version.save
      rescue CarrierWave::ProcessingError
        # Sometimes the file is written incorrectly to tmp and the download
        # just needs to be retried.
        BookVersionWorkers::DownloadImage.perform_async warehouse_book_version_id, url
      end
    end
  end
end
-
-
1
class PopulateWebData
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes the Amazon product page for seller, description and author ASIN,
  # then hands the collected attributes to the Update worker.
  def perform(warehouse_book_version_id, asin, tld, author_name)
    ProxyUtilities.proxy_setup :amazon
    page = AmazonProductPage.by_asin_and_tld asin, tld
    return unless handle_captcha(page, 60, warehouse_book_version_id, asin, tld, author_name)

    attributes = {
      id: warehouse_book_version_id,
      sold_by: page.scrape_sold_by,
      amazon_book_description: page.scrape_amazon_description,
      author_asin: page.scrape_author_asin
    }

    # When the product page carries no author ASIN, try to recover it from
    # the author page in a separate job.
    if attributes[:author_asin].blank? && page.scrape_author_page_url.present? && page.scrape_author_name.present?
      BookVersionWorkers::SetAuthorAsinFromAuthorPage.perform_async warehouse_book_version_id, page.scrape_author_page_url, page.scrape_author_name
    end

    BookVersionWorkers::Update.perform_async attributes
  end
end
-
-
1
class SetAuthorAsinFromAuthorPage
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes an Amazon author page looking for a link whose text matches the
  # scraped author name and, when one is found, stores that link's ASIN on
  # the book version via the Update worker.
  def perform(warehouse_book_version_id, author_page_url, scraped_author_name)
    ProxyUtilities.proxy_setup :amazon
    page = AmazonPage.new author_page_url

    if handle_captcha(page, 60, warehouse_book_version_id, author_page_url, scraped_author_name)
      params = {id: warehouse_book_version_id}
      author_links = page.dom.search(".//span[@class='ptBrand']/a") # old format
      # NOTE(review): the original labeled this selector "old format" too,
      # but `h3.newaps` looks like the newer page layout used as a fallback
      # when the old selector matches nothing — confirm.
      author_links = page.dom.css('h3.newaps span a') if author_links.blank?

      if author_links.present?
        # NOTE(review): there is no break after a match, so when several
        # links share the author's name the LAST matching link wins —
        # confirm that is intended (a `find` would take the first).
        author_links.each do |author_link|
          if author_link.text.strip == scraped_author_name
            author_link_href = author_link['href']
            # Assumes the ASIN sits in path segment 3 of the author URL
            # (e.g. /Author-Name/e/ASIN/...) — TODO confirm against live URLs.
            params[:author_asin] = author_link_href.split('/')[3]
          end
        end
      end

      BookVersionWorkers::Update.perform_async params if params[:author_asin].present?
    end
  end
end
-
-
1
class UpdateWebData
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Re-scrapes an Amazon product page and pushes the refreshed attributes
  # (seller, description, physical details, publisher, pub date, page count)
  # through the Update worker.
  def perform(warehouse_book_version_id, asin, tld)
    ProxyUtilities.proxy_setup :amazon
    page = AmazonProductPage.by_asin_and_tld asin, tld
    return unless handle_captcha(page, 60, warehouse_book_version_id, asin, tld)

    attributes = {
      id: warehouse_book_version_id,
      sold_by: page.scrape_sold_by,
      amazon_book_description: page.scrape_amazon_description,
      physical_details: page.physical_details,
      publisher: page.scrape_publisher,
      pub_date: page.scrape_pub_date,
      pages: page.scrape_page_count
    }
    BookVersionWorkers::Update.perform_async attributes
  end
end
-
-
1
class PopulateCanonicalUrls
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Resolves and stores the canonical B&N and Goodreads URLs for a .com book
  # version; other tlds are simply re-saved untouched.
  def perform(warehouse_book_version_id)
    ActiveRecord::Base.connection.uncached do
      ProxyUtilities.force_proxy
      book_version = WarehouseBookVersion.find warehouse_book_version_id

      if book_version.tld == '.com'
        book_version.canonical_bn_url = HttpHelper.get_canonical_bn_url book_version.isbn13, book_version.tld
        # Prefer a Goodreads lookup by ISBN when valid; fall back to the ASIN.
        if ISBN_Tools.is_valid? book_version.isbn_or_asin
          book_version.canonical_goodreads_url = HttpHelper.get_canonical_goodread_url book_version.isbn_or_asin
        end
        book_version.canonical_goodreads_url ||= HttpHelper.get_canonical_goodread_url book_version.asin
      end

      book_version.save!
    end
  end
end
-
-
1
class ScheduleGetItunesMetadata
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Bulk-enqueues GetItunesMetadata jobs for ingested Kindle editions that
  # carry an ISBN-13 but have no iTunes id yet.
  def perform
    ActiveRecord::Base.connection.uncached do
      candidates = WarehouseBookVersion.ingested.where{isbn13 != nil}.where(itunes_id: nil).where{(book_format == 'Kindle Edition') | (book_format == 'Kindle Edition with Audio/Video')}
      candidates.value_of(:id).each_slice(RedisUtilities::BLOCK_SIZE) do |id_block|
        Sidekiq::Client.push_bulk('class' => BookVersionWorkers::GetItunesMetadata, 'args' => id_block.collect { |id| Array(id) })
      end
    end
  end
end
-
-
1
class ScheduleGermanCompetitiveScrape
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Queues GermanCompetitiveCoverage jobs for the owning account's ingested
  # print/Kindle book versions, attaching the mongo work id (by isbn13) to
  # each [asin, book_format, isbn13] row.
  #
  # user_email - account whose books are scheduled; previously hard-coded,
  #              now a parameter defaulting to the same address so existing
  #              callers are unaffected.
  def perform(user_email = 'rhde@booklr.com')
    ActiveRecord::Base.connection.uncached do
      user = User.find_by_email(user_email)
      # Guard: a missing account previously raised NoMethodError on nil.
      if user.nil?
        Rails.logger.tagged('book_data') {Rails.logger.info "ScheduleGermanCompetitiveScrape: no user with email #{user_email}"}
        next
      end

      # Map isbn13 -> work_id from the mongo work-id collection.
      isbn_to_work_id = {}
      MongoUtilities.work_id_collection.find.each {|x| isbn_to_work_id[x['_id']] = x['work_id']}

      book_values = user.warehouse_book_versions.ingested.where(book_format: ['Paperback', 'Hardcover', 'Mass Market Paperback', 'Kindle Edition', 'Kindle Edition with Audio/Video', 'Board Book']).value_of(:asin, :book_format, :isbn13)

      # Append the work id (nil when unknown) to each row.
      values = book_values.map {|value_array| value_array + [isbn_to_work_id[value_array[2]]] }

      values.each_slice(10000) do |slice|
        Sidekiq::Client.push_bulk('class' => MongoWorkers::GermanCompetitiveCoverage, 'args' => slice)
      end
    end
  end
end
-
-
1
class GetItunesMetadata
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # Fetches iTunes metadata by ISBN-13 for a Kindle book version and stores
  # it, unless the returned iTunes id is missing or already claimed by
  # another record.
  def perform(warehouse_book_version_id)
    ActiveRecord::Base.connection.uncached do
      book_version = WarehouseBookVersion.find warehouse_book_version_id
      next unless book_version.isbn13.present? && book_version.book_format.include?('Kindle')

      metadata = ItunesApi.get_metadata_by_isbn13 book_version.isbn13
      if metadata[:itunes_id].present? && !WarehouseBookVersion.where(itunes_id: metadata[:itunes_id]).exists?
        book_version.update_attributes! metadata
      end
    end
  end
end
-
-
1
class QueueSyncMongoAsinList
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Splits all WarehouseBookVersion ids into 50k-wide slices and queues one
  # SyncMongoAsinList job per [first_id, last_id] range.
  def perform
    Rails.logger.tagged('sync') {Rails.logger.info 'Starting asin list sync to Mongo'}
    ranges = WarehouseBookVersion.order(:id).value_of(:id).each_slice(50000).collect do |slice|
      [slice.first, slice.last]
    end

    Sidekiq::Client.push_bulk('class' => BookVersionWorkers::SyncMongoAsinList, 'args' => ranges)
  end
end
-
-
1
class SyncMongoAsinList
  include Sidekiq::Worker
  sidekiq_options queue: :etl

  # Mirrors the (asin, tld) pairs of one id range of WarehouseBookVersions
  # into the mongo "all asin" list.
  def perform(starting_warehouse_book_version_id, ending_warehouse_book_version_id)
    ActiveRecord::Base.connection.uncached do
      # NOTE(review): `.compact` acts on the [asin, tld] pairs, which are
      # never nil themselves, so it looks like a no-op; if the intent was to
      # drop rows with a nil asin, the pair's first element would need to be
      # checked instead — confirm before changing.
      all_asin_documents = WarehouseBookVersion.where(id: starting_warehouse_book_version_id..ending_warehouse_book_version_id).value_of(:asin, :tld).uniq.compact.collect do |asin, tld|
        MongoUtilities.all_asin_document asin, tld
      end

      MongoUtilities.add_documents_to_all_asin_list all_asin_documents
    end
  end
end
-
-
1
class UpdateStatuses
  include Sidekiq::Worker
  sidekiq_options queue: :etl

  # Copies statuses staged in the BookVersionStatusCollection over to the
  # matching WarehouseBookVersion rows, then drops the staging collection.
  def perform
    collection = BookVersionStatusCollection.new

    # One update_all per distinct status, covering every id that carries it.
    collection.find.group_by { |record| record['status'] }.each_pair do |status, records|
      WarehouseBookVersion.where(id: records.collect { |record| record['_id'] }).update_all(status: status)
    end

    collection.drop
  end
end
-
end
-
1
module DataCleanupWorkers
-
1
class PopulateMissingIsbnsFromApi
  include Sidekiq::Worker
  sidekiq_options queue: :background

  # Looks up the book version (bypassing the query cache) and asks
  # DataCleanup to backfill its missing ISBN-13 from the API.
  def perform(book_version_id)
    ActiveRecord::Base.connection.uncached do
      DataCleanup.populate_missing_isbn13_from_api WarehouseBookVersion.find(book_version_id)
    end
  end
end
-
-
#class BackfillMissingData
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :background
-
#
-
# def perform(book_version_id)
-
# ActiveRecord::Base.connection.uncached do
-
# book_version = WarehouseBookVersion.find book_version_id
-
# DataCleanup.backfill_missing_data book_version
-
# end
-
# end
-
#end
-
-
#class PopulateEanFromAsin
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :background
-
#
-
# def perform(book_version_id)
-
# ActiveRecord::Base.connection.uncached do
-
# book_version = WarehouseBookVersion.find book_version_id
-
# ean = DataCleanup.find_ean_from_asin book_version
-
#
-
# # if EAN starts with 294 then it is a BN ID and we use bn_id, otherwise it is an isbn13
-
# if ean.present?
-
# if WarehouseBookVersion.where{((isbn13 == ean) | (bn_id == ean)) & (tld == '.com')}.exists?
-
# Rails.logger.tagged('book_data') {Rails.logger.info "ean match found for #{book_version.id} but key #{ean} already exists in db"}
-
# else
-
# key_type = Utilities.determine_key_type(ean)
-
# key_type == :bn_id ? book_version.update_attributes(bn_id: ean) : book_version.update_attributes(isbn13: ean)
-
# Rails.logger.tagged('book_data') {Rails.logger.info "found matching #{key_type}: #{ean} and setting #{key_type}"}
-
# end
-
# end
-
# end
-
# end
-
#end
-
end
-
1
module EnterpriseReports
-
1
module DailyReports
-
1
class RHPG
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds the daily RHPG stat reports (Amazon rank, Amazon price, B&N rank)
  # for each RHPG user account, comparing the day before the report date
  # against the report date itself. The CSVs are uploaded to S3; the email
  # is only sent when the produced row count is close enough to the number
  # of tracked asins (per EnterpriseReports.report_count_valid?).
  #
  # report_date_string - date of the report
  # deliver_email      - set false to build/upload without mailing
  def perform(report_date_string, deliver_email = true)
    report_date = report_date_string.to_date
    client_name = :rhpg
    client_config = AmazeBot.config[:reports][:clients][client_name]

    users = User.where(email: %w[rhpg@booklr.com randomhousecomps@booklr.com]).to_a

    users.each do |user|
      asins = user.warehouse_book_versions.ingested.value_of(:asin)
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- Report on #{asins.count} book versions for: #{user.name} -----"}

      # One CSV per report flavor, named report-type + date + user.
      amazon_rank_report_hash = EnterpriseReports.generate_report_hash("amazon-rank-#{report_date.strftime('%m%d%y')}-#{user.name.parameterize}", client_name)
      amazon_rank_csv = EnterpriseReports.open_csv(amazon_rank_report_hash)

      amazon_price_report_hash = EnterpriseReports.generate_report_hash("amazon-price-#{report_date.strftime('%m%d%y')}-#{user.name.parameterize}", client_name)
      amazon_price_csv = EnterpriseReports.open_csv(amazon_price_report_hash)

      bn_rank_report_hash = EnterpriseReports.generate_report_hash("bn-rank-#{report_date.strftime('%m%d%y')}-#{user.name.parameterize}", client_name)
      bn_rank_csv = EnterpriseReports.open_csv(bn_rank_report_hash)

      amazon_rank_header = ['Book Title', 'Author', 'ISBN/ASIN', 'Book Type', 'Category']
      other_header = ['Book Title', 'Author', 'ISBN/ASIN', 'Book Type']
      # Two-day window: the day before the report date, then the report date.
      days = (report_date.yesterday).upto(report_date).to_a
      warehouse_date_ids = WarehouseDate.where(date: days).order(:date).value_of :id
      days.each do |day|
        amazon_rank_header << day.strftime('%m/%d/%Y')
        other_header << day.strftime('%m/%d/%Y')
      end

      amazon_rank_header << 'Percent Change'
      other_header << 'Percent Change'
      # Append header row (base columns + one column per day + change)
      amazon_rank_csv << amazon_rank_header
      amazon_price_csv << other_header
      bn_rank_csv << other_header

      warehouse_book_version_ids = WarehouseBookVersion.com.where(asin: asins).value_of :id
      warehouse_region_id = WarehouseRegion.com.id
      row_count = 0

      # Pull stats in batches of 1000 book versions to keep the join bounded.
      (warehouse_book_version_ids.count / 1000 + 1).times do |count|
        sql = WarehouseStat.single_query_join_and_select(WarehouseStat::WAREHOUSE_STAT_FIELDS,
          {warehouse_book_version: %w[id title isbn13 book_format author_name]},
          {warehouse_amazon_sales_rank_category: %w[name],
           warehouse_amazon_category1: %w[name],
           warehouse_amazon_category2: %w[name],
           warehouse_amazon_category3: %w[name]}).where{warehouse_stats.warehouse_book_version_id.in warehouse_book_version_ids[(1000 * count)..(1000 * (count + 1) - 1)]}.where(warehouse_region_id: warehouse_region_id, warehouse_date_id: warehouse_date_ids).order(:warehouse_book_version_id, :created_at).to_sql

        # Rows come back ordered by book version id, so chunk groups each
        # version's stats together. For each day the LAST recorded stat wins.
        ActiveRecord::Base.connection.execute(sql).chunk {|el| el['warehouse_book_version_id']}.each do |_, stats|
          day_1_stat = stats.select {|stat| stat['warehouse_date_id'] == warehouse_date_ids.first.to_s}.last.try(:with_indifferent_access)
          day_2_stat = stats.select {|stat| stat['warehouse_date_id'] == warehouse_date_ids.last.to_s}.last.try(:with_indifferent_access)
          stats = [day_1_stat, day_2_stat]
          reference_stat = day_1_stat || day_2_stat
          base_row = [reference_stat[:warehouse_book_version_title], reference_stat[:warehouse_book_version_author_name], EnterpriseReports.isbn_output(reference_stat[:warehouse_book_version_isbn13]), reference_stat[:warehouse_book_version_book_format]]

          # Rank report: one row per rank category that has data on either
          # day. Columns 4-7: category name, day-1 rank, day-2 rank, change.
          amazon_rank_base_row = (base_row + [nil, nil, nil])
          [:amazon_sales, :amazon_category1, :amazon_category2, :amazon_category3].each do |method|
            if day_1_stat.present? && day_1_stat.send(:[], "#{method}_rank").present? || day_2_stat.present? && day_2_stat.send(:[], "#{method}_rank").present?
              el = day_1_stat || day_2_stat
              amazon_rank_stats_row = amazon_rank_base_row.dup
              amazon_rank_stats_row[4] = method == :amazon_sales ? el[:warehouse_amazon_sales_rank_category_name] : el.send(:[], "warehouse_#{method}_name")
              amazon_rank_stats_row[5] = day_1_stat.try :send, :[], "#{method}_rank"
              amazon_rank_stats_row[6] = day_2_stat.try :send, :[], "#{method}_rank"
              if amazon_rank_stats_row[5].present? && amazon_rank_stats_row[6].present?
                amazon_rank_stats_row[7] = "#{EnterpriseReports.percent_change_from_for_rank amazon_rank_stats_row[5].to_i, amazon_rank_stats_row[6].to_i}%"
              end
              amazon_rank_csv << amazon_rank_stats_row
            end
          end

          # Price and B&N rank reports share the base row; the B&N report
          # relabels Kindle editions as NOOK Book.
          amazon_price_stats_row = base_row.dup
          bn_rank_stats_row = base_row.dup
          bn_rank_stats_row[3] = (reference_stat[:warehouse_book_version_book_format].include?('Kindle') ? 'NOOK Book' : reference_stat[:warehouse_book_version_book_format])

          if stats[0].present?
            # amazon_price is stored in cents; convert to a dollar amount.
            amazon_price_stats_row[4] = stats[0][:amazon_price].present? ? stats[0][:amazon_price].to_i / 100.0 : nil
            bn_rank_stats_row[4] = stats[0][:bn_sales_rank]
          end

          if stats[1].present?
            amazon_price_stats_row[5] = stats[1][:amazon_price].present? ? stats[1][:amazon_price].to_i / 100.0 : nil
            bn_rank_stats_row[5] = stats[1][:bn_sales_rank]
          end

          amazon_price_stats_row[6] = "#{EnterpriseReports.percent_change_from amazon_price_stats_row[4], amazon_price_stats_row[5]}%" if amazon_price_stats_row[4].present? && amazon_price_stats_row[5].present?
          bn_rank_stats_row[6] = "#{EnterpriseReports.percent_change_from_for_rank bn_rank_stats_row[4].to_i, bn_rank_stats_row[5].to_i}%" if bn_rank_stats_row[4].present? && bn_rank_stats_row[5].present?

          amazon_price_csv << amazon_price_stats_row
          bn_rank_csv << bn_rank_stats_row
          row_count += 1
        end
      end

      # Upload finished report whether its complete or not
      EnterpriseReports.move_to_s3(client_name, amazon_rank_csv)
      EnterpriseReports.move_to_s3(client_name, amazon_price_csv)
      EnterpriseReports.move_to_s3(client_name, bn_rank_csv)

      # Determine if report is complete and then email about it and set redis details
      if EnterpriseReports.report_count_valid? row_count, asins.count
        $redis.hmset('daily_report_stats', 'rhpg-row-count', row_count, 'rhpg-send-time', Time.current.to_s)
        EnterpriseReportsMailer.basic_report([amazon_rank_report_hash, amazon_price_report_hash, bn_rank_report_hash], client_config[:reports][:daily_stats]).deliver if deliver_email
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{user.name.parameterize} Report Delivered -----"}
      else
        EnterpriseReports.send_report_count_error "#{client_name}-daily", row_count, asins.count
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{user.name.parameterize}(#{client_name}) Report NOT Delivered: row count off by 0.5% of more -----"}
      end

      amazon_rank_csv.close
      amazon_price_csv.close
      bn_rank_csv.close
    end
  end
end
-
end
-
end
-
1
module EnterpriseReports
-
1
module ExceptionReports
-
1
class RHPG
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds the daily RHPG exception reports — one CSV for books missing from
  # Amazon search and one for books missing from B&N search — then uploads
  # both to S3 and mails them to the client's configured recipients.
  def perform(user_email, date_string)
    Rails.logger.tagged('enterprise') {Rails.logger.info '--- Generating RHPG Exception Report ---'}
    user = User.find_by email: user_email
    report_date = date_string.to_date

    client_name = :rhpg
    client_config = AmazeBot.config[:reports][:clients][client_name]
    header = ['Book Title', 'Author', 'ISBN/ASIN']

    amazon_report_hash = EnterpriseReports.generate_report_hash("amazon-exception-report-#{report_date.strftime('%m%d%y')}-#{client_name}", client_name)
    amazon_csv = EnterpriseReports.open_csv(amazon_report_hash)
    amazon_csv << header

    bn_report_hash = EnterpriseReports.generate_report_hash("bn-exception-report-#{report_date.strftime('%m%d%y')}-#{client_name}", client_name)
    bn_csv = EnterpriseReports.open_csv(bn_report_hash)
    bn_csv << header

    warehouse_date_id = WarehouseDate.find_by(date: report_date).id

    user.tracked_book_versions.includes(:warehouse_book_version).find_each do |tracked|
      exceptions = tracked.warehouse_book_version.book_version_exceptions.where(warehouse_date_id: warehouse_date_id)
      amazon_csv << EnterpriseReports.get_rhpg_exception_report_row(tracked) if exceptions.where(amazon_not_found_in_search: true).exists?
      bn_csv << EnterpriseReports.get_rhpg_exception_report_row(tracked) if exceptions.where(bn_not_found_in_search: true).exists?
    end

    EnterpriseReports.move_to_s3(client_name, amazon_csv)
    EnterpriseReports.move_to_s3(client_name, bn_csv)
    EnterpriseReportsMailer.basic_report([amazon_report_hash, bn_report_hash], client_config[:reports][:daily_exception]).deliver

    amazon_csv.close
    bn_csv.close

    Rails.logger.tagged('enterprise') {Rails.logger.info '--- RHPG Exception Report Completed ---'}
  end
end
-
-
1
class RandomHouseCorporate
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds the daily RHINC exception report CSV, uploads it to S3, FTPs it
  # to the client and mails it to the configured recipients.
  def perform(user_email, date_string)
    Rails.logger.tagged('enterprise') {Rails.logger.info '--- Generating RHINC Exception Report ---'}
    user = User.find_by email: user_email
    report_date = date_string.to_date

    client_name = :rhinc
    client_config = AmazeBot.config[:reports][:clients][client_name]

    header = ['Book Title', 'Author', 'ISBN', 'ASIN', 'Book Format', 'Division Code', 'Timestamp', 'Missing On Amazon', 'Ambiguous Results', 'Buy Button Missing Amazon', 'Price Missing Amazon', 'Missing on BN', 'Missing on Itunes']

    report_hash = EnterpriseReports.generate_report_hash("exception-report-#{report_date.strftime('%m%d%y')}-#{client_name}", client_name)
    csv = EnterpriseReports.open_csv(report_hash)
    csv << header

    warehouse_date_id = WarehouseDate.find_by(date: report_date).id
    flagged_validations = [:bn_not_found_in_search, :no_isbn, :amazon_not_found_in_search, :amazon_no_buy_button, :amazon_no_price,
                           :amazon_ambiguous_result, :apple_invalid]

    user.tracked_book_versions.includes(warehouse_book_version: :book_version_exceptions).where('book_version_exceptions.warehouse_date_id = ?', warehouse_date_id).references(:book_version_exceptions).find_each do |tracked|
      exception = tracked.warehouse_book_version.book_version_exceptions.last

      # Output the row when any tracked validation failed; amazon image
      # validations are excluded since they are not shown in this report yet.
      if flagged_validations.any? {|method| exception.send method}
        csv << EnterpriseReports.get_rhinc_exception_report_row(tracked, exception)
      end
    end

    EnterpriseReports.move_to_s3(client_name, csv)
    EnterpriseReports.ftp_to_client(client_name, csv)
    EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:daily_exception]).deliver

    csv.close

    Rails.logger.tagged('enterprise') {Rails.logger.info '--- RHINC Exception Report Completed ---'}
  end
end
-
-
1
class Vook
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds the daily Vook exception report CSV (same shape as the RHINC
  # report, minus the FTP delivery), uploads it to S3 and mails it to the
  # client's configured recipients.
  def perform(user_email, date_string)
    Rails.logger.tagged('enterprise') {Rails.logger.info '--- Generating Vook Exception Report ---'}
    user = User.find_by email: user_email
    date = date_string.to_date

    client_name = :vook
    client_config = AmazeBot.config[:reports][:clients][client_name]

    exception_report_header = ['Book Title', 'Author', 'ISBN', 'ASIN', 'Book Format', 'Division Code', 'Timestamp', 'Missing On Amazon', 'Ambiguous Results', 'Buy Button Missing Amazon', 'Price Missing Amazon', 'Missing on BN', 'Missing on Itunes']

    file_name_ending = client_name

    report_hash = EnterpriseReports.generate_report_hash("exception-report-#{date.strftime('%m%d%y')}-#{file_name_ending}", client_name)
    exception_report_csv = EnterpriseReports.open_csv(report_hash)
    exception_report_csv << exception_report_header

    warehouse_date_id = WarehouseDate.find_by(date: date).id
    user.tracked_book_versions.includes(warehouse_book_version: :book_version_exceptions).where('book_version_exceptions.warehouse_date_id = ?', warehouse_date_id).references(:book_version_exceptions).find_each do |tracked_book_version|
      book_version_exception = tracked_book_version.warehouse_book_version.book_version_exceptions.last

      # If any of the validations show up as invalid, output this row, except do not include amazon image
      # validations since those arent shown yet in this report
      exception_report_csv << EnterpriseReports.get_rhinc_exception_report_row(tracked_book_version, book_version_exception) if
        [:bn_not_found_in_search, :no_isbn, :amazon_not_found_in_search, :amazon_no_buy_button, :amazon_no_price,
         :amazon_ambiguous_result, :apple_invalid].any? {|method| book_version_exception.send method}
    end

    EnterpriseReports.move_to_s3(client_name, exception_report_csv)
    EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:daily_exception]).deliver

    exception_report_csv.close

    # Fixed: this previously logged "--- RHINC Exception Report Completed ---",
    # a copy-paste left over from the RandomHouseCorporate worker.
    Rails.logger.tagged('enterprise') {Rails.logger.info '--- Vook Exception Report Completed ---'}
  end
end
-
end
-
end
-
1
module EnterpriseReports
-
1
module ListStatReports
-
1
# Per-client Amazon browse-node configuration for the top-100 list reports:
# each client maps to a set of :category_ids (and, for rhinc, a :names match
# against canonical category names) used to select WarehouseCategory rows.
AMAZON_CATEGORY_LISTS = {
  perseus: {category_ids: %w[2365 2376 2394 2396 2399 171115 11322 3639 3573 3887 280311 21 11970 11119 4810 4869 16244431 5015 4837 4861 4853 4884 4891 4935 4947 4952 4954 4939 4948 4956 5032 4978 15812171 197501011 5011 14450 5020 5026 5027 5028 5030 5031 5035 720360 10177 4682 10753 11320 11401 11019 11232 12292 12300 12350 12360 12735 13871 13884 16272]},
  libboo: {category_ids: %w[157055011 154607011 154754011 319635011 157052011 156154011 158576011 157050011 156576011 157060011 157305011 157430011 157078011 157626011 158125011 158280011 158566011 158597011 158591011 156295011]},
  booklr: {category_ids: %w[2365 2376]},
  pll: {category_ids: %w[3511261011 6110890011 6064558011 6064559011 6064561011 6064560011 6064565011 6110891011 6064562011 6064564011 6190488011]},
  rhinc: {names: 'Kindle Store > Kindle eBooks'}
}.freeze
-
-
1
class AmazonTop100
  include EnterpriseReports
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds a client's Amazon Top-100 CSV: one row per (category, date, rank),
  # annotated with the title's ranks in up to four other Amazon categories,
  # then uploads to S3, emails the report, and (for rhinc only) FTPs it to RHPG.
  #
  # client_name           - client key used for storage/email config (e.g. :rhinc)
  # report_name           - report key, dasherized into the file name
  # category_ids_or_names - hash with :category_ids and/or :names
  #                         (see AMAZON_CATEGORY_LISTS)
  # start_date_string / end_date_string - inclusive date range (date-parseable)
  def perform(client_name, report_name, category_ids_or_names, start_date_string, end_date_string)
    date_range = start_date_string.to_date..end_date_string.to_date
    warehouse_date_ids = WarehouseDate.where(date: date_range).order(:date).value_of :id
    category_ids_or_names = category_ids_or_names.with_indifferent_access
    warehouse_region_id = WarehouseRegion.com.id
    warehouse_category_ids = WarehouseCategory.com.canonical.where{(category_id.in category_ids_or_names[:category_ids]) | (name.in category_ids_or_names[:names])}.value_of :id
    # Every list carries exactly 100 ranks per category per day.
    expected_count = warehouse_date_ids.count * warehouse_category_ids.count * 100

    base_file_name = "#{client_name}-#{report_name.to_s.dasherize}-#{date_range.first.strftime("%m%d%y")}"
    report_file_name = date_range.count == 1 ? base_file_name : "#{base_file_name}-#{date_range.last.strftime("%m%d%y")}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name
    report_csv = EnterpriseReports.open_csv(report_hash)
    report_csv << ['List Name', 'List Rank', 'Author', 'Title', 'ASIN', 'Price', 'List Rank Date', 'Other category 1', 'Other category 1 Rank',
                   'Other category 2', 'Other category 2 Rank', 'Other category 3', 'Other category 3 Rank', 'Other category 4', 'Other category 4 Rank', 'Other category Rank Date']

    list_stat_sql = WarehouseListStat.single_query_join_and_select('*',
                                                                   {warehouse_date: %w[date],
                                                                    warehouse_category: %w[name category_id]},
                                                                   {warehouse_book_version: %w[id title asin author_name],
                                                                    warehouse_trend: %w[name]}).where{warehouse_category_id.in warehouse_category_ids}.where(warehouse_date_id: warehouse_date_ids).order('warehouse_category_name, warehouse_date_date DESC, rank').to_sql
    list_stat_results = ActiveRecord::Base.connection.execute(list_stat_sql)
    asins = list_stat_results.collect {|result| result["warehouse_book_version_asin"]}.compact.uniq
    warehouse_book_version_ids = WarehouseBookVersion.com.where(asin: asins).value_of(:id)

    warehouse_stat_sql = WarehouseStat.single_query_join_and_select('*',
                                                                   {warehouse_book_version: %w[id title asin],
                                                                    warehouse_date: %w[date]},
                                                                   {warehouse_amazon_sales_rank_category: %w[name],
                                                                    warehouse_amazon_category1: %w[name],
                                                                    warehouse_amazon_category2: %w[name],
                                                                    warehouse_amazon_category3: %w[name]}).where{warehouse_stats.warehouse_book_version_id.in warehouse_book_version_ids}.where(warehouse_date_id: warehouse_date_ids, warehouse_region_id: warehouse_region_id).to_sql
    warehouse_stat_results = ActiveRecord::Base.connection.execute(warehouse_stat_sql).collect{|result| result}

    # Index stats by ASIN once up front (first occurrence wins, mirroring the
    # previous Array#find) instead of re-scanning the whole stat set per row.
    warehouse_stats_by_asin = {}
    warehouse_stat_results.each do |warehouse_stat_result|
      asin = warehouse_stat_result["warehouse_book_version_asin"]
      warehouse_stats_by_asin[asin] = warehouse_stat_result unless warehouse_stats_by_asin.key?(asin)
    end

    list_stat_results.each do |list_stat_result|
      # Warehouse-joined columns win over the raw scraped list columns.
      row = [list_stat_result['warehouse_category_name'], list_stat_result['rank'],
             list_stat_result['warehouse_book_version_author_name'] || list_stat_result['author'],
             list_stat_result['warehouse_book_version_title'] || list_stat_result['title'],
             list_stat_result['warehouse_book_version_asin'] || list_stat_result['asin'],
             list_stat_result['price'].try(:to_i).try(:/, 100.0), list_stat_result['warehouse_date_date']] # price stored in cents
      warehouse_stat = warehouse_stats_by_asin[list_stat_result["asin"]]
      other_ranks = []
      if warehouse_stat.present?
        sales_rank_category_name = warehouse_stat['warehouse_amazon_sales_rank_category_name'] || warehouse_stat['warehouse_amazon_sales_rank_category_id_fallback']
        category1_name = warehouse_stat['warehouse_amazon_category1_name'] || warehouse_stat['warehouse_amazon_category1_id_fallback']
        category2_name = warehouse_stat['warehouse_amazon_category2_name'] || warehouse_stat['warehouse_amazon_category2_id_fallback']
        category3_name = warehouse_stat['warehouse_amazon_category3_name'] || warehouse_stat['warehouse_amazon_category3_id_fallback']
        # Skip whichever category duplicates the list this row already belongs to.
        other_ranks += [sales_rank_category_name, warehouse_stat['amazon_sales_rank']] unless sales_rank_category_name == list_stat_result['warehouse_category_name']
        other_ranks += [category1_name, warehouse_stat['amazon_category1_rank']] unless category1_name == list_stat_result['warehouse_category_name']
        other_ranks += [category2_name, warehouse_stat['amazon_category2_rank']] unless category2_name == list_stat_result['warehouse_category_name']
        other_ranks += [category3_name, warehouse_stat['amazon_category3_rank']] unless category3_name == list_stat_result['warehouse_category_name']
      end
      # Pad to exactly 8 cells (4 name/rank pairs) so columns stay aligned.
      row += EnterpriseReports.pad_serialized_data(other_ranks, 8) do |amazon_list_stat_sales_ranks|
        amazon_list_stat_sales_ranks
      end
      row += [list_stat_result['warehouse_date_date']]
      report_csv << row
    end

    report_csv.flush

    begin
      if EnterpriseReports.report_count_valid?(list_stat_results.count, expected_count)
        EnterpriseReports.move_to_s3(client_name, report_csv)
        EnterpriseReportsMailer.basic_report(report_hash, get_report_email_details(client_name, report_name)).deliver
        EnterpriseReports.ftp_to_rhpg(report_hash[:report_location]) if client_name.to_sym == :rhinc
      else
        EnterpriseReports.send_report_count_error report_file_name, list_stat_results.count, expected_count
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_file_name} Report NOT Delivered: row count off by 0.5% or more -----"}
      end
    rescue *HTTP_ERRORS => e
      Rails.logger.tagged('enterprise') {Rails.logger.info "Error trying to send top 100 amazon email or upload to FTP: #{e}"}
    ensure
      report_csv.close
    end
  end
end
-
-
1
class AmazonAllTop100
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Dumps every Amazon top-100 list row for the given date range straight from
  # SQL to CSV, uploads the file to S3, and always closes the file handle.
  # Both dates default to today when omitted.
  def perform(client_name, start_date_string = nil, end_date_string = nil)
    start_date = start_date_string.try(:to_date) || Date.current
    end_date = end_date_string.try(:to_date) || Date.current
    date_range = start_date..end_date
    warehouse_date_ids = WarehouseDate.where(date: date_range).order(:date).value_of :id

    # Single-day reports omit the end-date suffix from the file name.
    name_prefix = "amazon-all-top-100-#{date_range.first.strftime("%m%d%y")}"
    report_file_name = date_range.count == 1 ? name_prefix : "#{name_prefix}-#{date_range.last.strftime("%m%d%y")}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name

    # Column aliases double as the CSV header row produced by the COPY.
    list_stat_sql = WarehouseListStat.single_query_join_and_select('warehouse_category.name as "List Name", warehouse_list_stats.rank as "List Rank", warehouse_list_stats.asin as "ASIN", warehouse_list_stats.title as "Title", warehouse_list_stats.author as "Author", round(warehouse_list_stats.price / 100.0, 2) as "Price", warehouse_date.date as "List Rank Date"',
                                                                   {warehouse_date: [],
                                                                    warehouse_category: []},
                                                                   nil).where(warehouse_date_id: warehouse_date_ids).order('"List Name", "List Rank Date" DESC, rank').to_sql
    report_csv = EnterpriseReports.sql_copy_to_csv(:amazon_all_top_100, report_hash[:report_location], WarehouseListStat.connection.raw_connection, list_stat_sql)

    begin
      EnterpriseReports.move_to_s3(client_name, report_csv)
    rescue *HTTP_ERRORS => e
      Rails.logger.tagged('enterprise') {Rails.logger.info "Error trying to send top 100 amazon email or upload to FTP: #{e}"}
    ensure
      report_csv.close
    end
  end
end
-
-
1
class GenerateBarnesAndNobleBestSellersReport
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Writes the rhinc "NOOK Book bestsellers" top-100 CSV for one day, then
  # uploads to S3, FTPs to the client, and emails the report. The CSV handle
  # is always closed via ensure.
  def perform(report_date_string = Date.current.to_s)
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.where(date: report_date).value_of :id
    client_name = :rhinc
    client_config = AmazeBot.config[:reports][:clients][client_name]
    report_file_name = "top-100-barnes-and-noble-#{report_date.strftime("%m%d%y")}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name
    report_csv = EnterpriseReports.open_csv(report_hash)
    report_csv << ['Rank', 'Author', 'Title', 'ISBN', 'BN ID', 'Price', 'Date']
    row_count = 0

    WarehouseListStat.where(warehouse_date_id: warehouse_date_id, name: MongoListStatWorkers::BarnesNobleTop100::LIST_NAMES[:bn_nook_book_bestsellers]).order(:rank).each do |warehouse_list_stat|
      # price is stored in cents
      report_csv << [warehouse_list_stat.rank, warehouse_list_stat.author, warehouse_list_stat.title, warehouse_list_stat.isbn, warehouse_list_stat.bn_id, warehouse_list_stat.price.try(:/, 100.0), report_date]
      row_count += 1
    end
    report_csv.flush

    begin
      # This report should always put out 100 rows
      if EnterpriseReports.report_count_valid?(row_count, 100)
        EnterpriseReports.move_to_s3(client_name, report_csv)
        EnterpriseReports.ftp_to_client(client_name, report_csv)
        EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:bn_top_100]).deliver
        Rails.logger.tagged('enterprise') {Rails.logger.info "Report Delivered to email lists and copied to production"}
      else
        EnterpriseReports.send_report_count_error report_file_name, row_count, 100
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_file_name} Report NOT Delivered: row count off by 0.5% or more -----"}
      end
    rescue *HTTP_ERRORS => e
      # Fix: the captured exception was previously dropped from the log line,
      # making delivery failures undiagnosable.
      Rails.logger.tagged('enterprise') {Rails.logger.info "Error trying to send top 100 bn bestsellers email or upload to FTP: #{e}"}
    ensure
      report_csv.close
    end
  end
end
-
-
1
class GenerateBarnesAndNobleTop100Report
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Writes the booklr "real" B&N top-100 CSV for one day, uploads it to S3 and
  # emails it. Unlike the bestsellers variant there is no client FTP step.
  def perform(date_string = Date.current.to_s)
    report_date = date_string.to_date
    warehouse_date_id = WarehouseDate.where(date: report_date).value_of :id
    client_name = :booklr
    client_config = AmazeBot.config[:reports][:clients][client_name]
    report_file_name = "real-top-100-barnes-and-noble-#{report_date.strftime("%m%d%y")}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name
    report_csv = EnterpriseReports.open_csv(report_hash)
    report_csv << ['Rank', 'Author', 'Title', 'ISBN', 'BN ID', 'Price', 'Date']
    row_count = 0

    WarehouseListStat.where(warehouse_date_id: warehouse_date_id, name: MongoListStatWorkers::BarnesNobleTop100::LIST_NAMES[:bn_top_100_nook_books]).order(:rank).each do |warehouse_list_stat|
      # price is stored in cents
      report_csv << [warehouse_list_stat.rank, warehouse_list_stat.author, warehouse_list_stat.title, warehouse_list_stat.isbn, warehouse_list_stat.bn_id, warehouse_list_stat.price.try(:/, 100.0), report_date]
      row_count += 1
    end
    report_csv.flush

    begin
      # This report should always put out 100 rows
      if EnterpriseReports.report_count_valid?(row_count, 100)
        EnterpriseReports.move_to_s3(client_name, report_csv)
        EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:real_top_100_bn]).deliver
        Rails.logger.tagged('enterprise') {Rails.logger.info "Report Delivered to email lists and copied to production"}
      else
        EnterpriseReports.send_report_count_error report_file_name, row_count, 100
        Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_file_name} Report NOT Delivered: row count off by 0.5% or more -----"}
      end
    ensure
      # Fix: previously the handle leaked when move_to_s3/mailer raised.
      # Exceptions still propagate so Sidekiq can retry the job.
      report_csv.close
    end
  end
end
-
-
1
class AppleAllTopBooksReport
  include EnterpriseReports
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Exports every Apple book-category list for one day (all list types) and
  # emails it to the booklr :apple_top_books recipients. No FTP, no gzip.
  def perform(report_date_string = Date.current.to_s)
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: report_date).id
    # Aliased columns become the CSV header row in the COPY output.
    selected_columns = 'warehouse_categories.name AS "List Name", list_type AS "List Type", rank AS "List Rank", author AS "Author", title AS "Title", itunes_id AS "iTunes ID", trunc(price / 100.0, 2) AS "Price", warehouse_dates.date AS "List Rank Date"'
    report_sql = WarehouseListStat.select(selected_columns).joins(:warehouse_date, :warehouse_category).where{warehouse_category.category_type == 'AppleBookCategory'}.where(warehouse_date_id: warehouse_date_id).order('warehouse_categories.name, list_type DESC, rank').to_sql
    sql_copy_to_csv_and_deliver_report(report_sql, :booklr, "apple-all-books-ranks-report-#{report_date.strftime("%m%d%y")}", nil, ftp: false, emailable_report_name: :apple_top_books, gzip: false)
  end
end
-
end
-
end
-
1
module OneTimeReports
-
1
class Goodreads
  include EnterpriseReports
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # One-off export of Goodreads work/edition/star-count stats for every
  # ingested .com book version on a single date, delivered to booklr.
  # Pass deliver_email = false to generate the file without emailing it.
  def perform(report_date_string, deliver_email = true)
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: report_date).id

    goodreads_columns = %w[goodreads_work_average_rating goodreads_work_rating_count goodreads_work_review_count
                           goodreads_work_added_by_count goodreads_work_to_read_count goodreads_edition_average_rating
                           goodreads_edition_rating_count goodreads_edition_review_count goodreads_edition_added_by_count
                           goodreads_5_star_count goodreads_4_star_count goodreads_3_star_count goodreads_2_star_count goodreads_1_star_count]
    joined_columns = {warehouse_book_version: %w[title book_format asin isbn13 bn_id author_name]}
    report_sql = WarehouseStat.single_query_join_and_select(goodreads_columns, joined_columns, nil).where(warehouse_region_id: WarehouseRegion.com.id, warehouse_date_id: warehouse_date_id).order(:warehouse_book_version_id).to_sql

    # Expected row count = number of ingested .com book versions.
    sql_copy_to_csv_and_deliver_report(report_sql, :booklr, "goodreads-report-#{report_date.strftime("%m%d%y")}", WarehouseBookVersion.com.ingested.count, ftp: false, emailable_report_name: (deliver_email ? :goodreads : nil))
  end
end
-
-
1
# One-off PLL report: for a fixed set of 23 ASINs, writes one transposed CSV
# per metric (amazon_price, amazon_sales_rank, amazon_review_count,
# goodreads_work_rating_count) -- rows are ASINs, columns are dates -- then
# uploads each file via ReportUploader.
class PllHistoricAttributeReport
-
1
include EnterpriseReports
-
1
include Sidekiq::Worker
-
1
sidekiq_options queue: :reporting
-
-
1
def perform
-
asins = %w[B0058WCBOI B00G7J5NH8 B003JBI0QS B0067MSQEC B00CMLBK9U B0076LR1GW B00FO14UJM B007PLAVH4
-
B008PYM59C B007MF3NK0 B005DSA1T4 B0085AJQEI B00B0A5Y78 B00D3WHFHS B00GALGRJG B00DPN1SNW B00AOHDMFE
-
B00G75EQMA B00E5H5E3W B0051UBSLE B00C8324IS B0090U0J3Y B00DJUN2WG]
-
warehouse_book_version_ids = WarehouseBookVersion.where(asin: asins).value_of :id
-
columns = %w[amazon_price amazon_sales_rank amazon_review_count goodreads_work_rating_count]
-
# Selected column order is asin(0), date(1), then the four metric columns;
# rows come back ordered date DESC, then book version.
sql = WarehouseStat.select('warehouse_book_versions.asin, warehouse_dates.date').select(columns).joins(:warehouse_date, :warehouse_book_version).where(warehouse_book_version_id: warehouse_book_version_ids).order('warehouse_dates.date desc').order(:warehouse_book_version_id).to_sql
-
res = WarehouseStat.connection.execute(sql)
-
# One raw file handle per metric; CSV text is hand-assembled with << below.
# NOTE(review): these handles are only closed at the bottom -- an exception
# mid-loop leaks them.
csvs = columns.each_with_object({}) {|column, hash| hash[column] = File.open("./tmp/pll_historical_#{column}_report_#{Date.current.to_s}.csv", 'wb')}
-
current_asin = nil
-
# Header row: blank corner cell, then every distinct date (column 1 of the
# result set, transposed), comma-joined.
csvs.values.each {|csv| csv << ",#{res.values.transpose[1].uniq.join(',')}\n"}
-
res.each do |row|
-
# Start a new CSV line whenever the ASIN changes (rows are grouped by
# book version within each date ordering).
if row['asin'] != current_asin
-
if current_asin != nil
-
csvs.values.each {|csv| csv << "\n"}
-
end
-
csvs.values.each {|csv| csv << row['asin']}
-
current_asin = row['asin']
-
end
-
# Append this row's value for each metric to that metric's file;
# prices are stored in cents, so convert to dollars.
columns.each do |column|
-
value = column.include?('price') ? row[column].try(:to_i).try(:/, 100.0) : row[column]
-
csvs[column] << ",#{value}"
-
end
-
end
-
-
# Flush, close, and upload each per-metric file to the PLL report bucket.
csvs.values.each do |csv|
-
csv.flush
-
csv.close
-
-
uploader = ReportUploader.new
-
uploader.client_name = :pll
-
uploader.store! File.new(csv.path)
-
end
-
end
-
end
-
-
1
class FishRichardsonDimensionReport
  include EnterpriseReports
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # One-off report of physical dimensions (Length x Width x Depth, inches) for
  # ingested .com paperback/hardcover titles, plus three "similar item"
  # category trees. Book versions are processed in batches of 1000 to bound
  # the per-query IN list.
  def perform(report_date_string)
    client_name = :booklr
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: report_date).id
    warehouse_region_id = WarehouseRegion.com.id

    # NOTE(review): file name uses Date.current rather than report_date -- confirm intentional.
    report_file_name = "fish-richardson-title-dimensions-report-#{Date.current.strftime("%m%d%y")}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name
    report_csv = EnterpriseReports.open_csv(report_hash)
    report_csv << ['ISBN', 'Title', 'Author', 'Format', 'Published Date', 'Length', 'Width', 'Depth', 'Similar Item Category 1', 'Similar Item Category 2', 'Similar Item Category 3']

    warehouse_book_version_ids = WarehouseBookVersion.ingested.com.where{physical_details != nil}.where(book_format: ["Paperback", "Hardcover"]).where("physical_details LIKE '%inches%'").order(:id).value_of(:id)

    (warehouse_book_version_ids.count / 1000 + 1).times do |count|
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- Generating F&R report for #{(1000 * count)..(1000 * (count + 1) - 1)} out of #{warehouse_book_version_ids.count} book versions -----"}

      sql = WarehouseStat.single_query_join_and_select('amazon_similar_item_category_tree_1, amazon_similar_item_category_tree_2, amazon_similar_item_category_tree_3',
                                                       {warehouse_book_version: %w[isbn13 title author_name book_format pub_date physical_details]},
                                                       nil).where{warehouse_stats.warehouse_book_version_id.in warehouse_book_version_ids[(1000 * count)..(1000 * (count + 1) - 1)]}.where(warehouse_date_id: warehouse_date_id, warehouse_region_id: warehouse_region_id).to_sql

      # NOTE(review): chunk yields [key, array-of-rows]; calling .values on that
      # array looks suspect -- confirm the grouped rows respond to #values here.
      ActiveRecord::Base.connection.execute(sql).chunk {|el| el['warehouse_book_version_id']}.each do |_, warehouse_stat_result|
        # Last 6 selected values are the joined book-version columns; the final
        # one is physical_details, e.g. "8 x 5.2 x 1.1 inches".
        book_version_values = warehouse_stat_result.values.last(6)
        dimension_parts = book_version_values.last.split(" x ")
        length = dimension_parts.first
        width = dimension_parts.second
        depth = dimension_parts.third.present? ? dimension_parts.third.split(" inches").first : "N/A"
        row = book_version_values.first(5) + [length] + [width] + [depth] + warehouse_stat_result.values.first(3)
        report_csv << row
      end
    end

    report_csv.flush
    begin
      EnterpriseReports.move_to_s3(client_name, report_csv)
    ensure
      # Fix: the handle was previously never closed.
      report_csv.close
    end
  end
end
-
end
-
1
module EnterpriseReports
-
1
module StatisticalReports
-
1
EXCLUDED_PUBLISHERS = ['12-Gauge Comics', '47North', '519 editores', 'ABC News', 'Abingdon Press', 'Abingdon Press Fiction', 'Accent Press', 'Accent Press Ltd', 'Ace', 'Adams Media', 'Aerie', 'Akashic Books', 'Akashic Noir Series', 'Akashic Urban Surreal Series', 'AKDigital', 'Alba Editorial', 'Albert Salvadó', 'Alfaguara', 'Algonquin Books', 'Allan Classics', 'Allison & Busby', 'Allison & Busby', 'Alpha', "Amazon Children's Publishing / Plympton", 'AmazonCrossing', 'AmazonEncore', 'Anchor', 'Anchor; 1st edition', 'Angry Robot', 'Aspect', 'Atida Press, The Olive Group', 'Atlantic Monthly Press', 'Atria', 'Atria Books', 'Atria Books; 1 Reprint edition', 'Atria Books; First edition', 'Atria Books; Reprint edition', 'Atria/Emily Bestler Books', 'Avon', 'Avon Impulse', 'Avon Inspire', 'B de Books', 'Back Bay Books', 'Baile del Sol', 'Ballantine', 'Ballantine Books', 'Ballantine Books; 1 edition', 'Ballantine Books; 1st Ballantine Books Domestic Ed edition', 'Ballantine Books; 1st edition', 'Ballantine Books; Ballantine Books ed edition', 'Ballantine Books; First edition', 'Ballantine Books; Original edition', 'Ballantine Books; Reissue edition', 'Ballantine Books; Reprint edition', 'Ballantine Group', 'Bantam', 'Bantam Books', 'Bantam Classics', 'Bantam Dell Pub Group, Westminster, Maryland, U.S.A.', 'Bantam Doubleday Dell', 'Bantam Fiction', 'Bantam; 1 edition', 'Bantam; 1st edition', 'Bantam; Original edition', 'Bantam; Reissue edition', 'Bantam; Reprint edition', 'Bantam; Revised edition', 'Bastei Luebbe', 'BBC Digital', 'Berkley', 'Berkley; 1 edition', 'Berkley; 1st edition', 'Berkley; Original edition', 'Berkley; Reprint edition', 'Bloomsbury', 'Bloomsbury Reader', 'Bloomsbury USA', 'BLOOMSBURY USA/WALKER', 'Blue Hen', 'Blue Rider Press', 'Broadway', 'Broadway; Reprint edition', 'Byliner Inc.', 'CAB,TROYA', 'Cash Money Content; Original edition', 'Center Street', 'ChiZine', 'CISNE', 'Cornerstone Digital', 'Corvallis Press', 'Crimeline', 'Crimeline; Reissue 
edition', 'Crimeline; Reprint edition', 'Crimeline; Revised edition', 'Crown', 'Crown; 1 edition', 'Dark Horse Comics', 'Daw', 'DEBOLS!LLO', 'Del Rey', 'Del Rey; 1 edition', 'Del Rey; Original edition', 'Delacorte Pr', 'Delacorte Press', 'Delacorte Press; 1 edition', 'Delacorte Press; 1st edition', 'Delacorte Press; First edition', 'Delacorte Press; Original edition', 'Delacorte Press; Tra edition', 'Delacorte Pubns Inc', 'Dell', 'Dell Publishing Company', 'Dell; 1 edition', 'Dell; 1st edition', 'Dell; Dell Mass Market ed edition', 'Dell; Original edition', 'Dell; Reissue edition', 'Dell; Reprint edition', 'Delta', "Delta; Oprah's Book Club edition", 'Delta; Original edition', 'Delta; Reprint edition', 'Diversion Books', 'Doubleday', 'Doubleday Canada', 'Doubleday; 1st edition', 'Dover Publications', 'DUTTON ADULT', 'Dutton Adult', 'Dutton Guilt Edged Mysteries', 'Ecco', 'Ediciones Siruela', 'Editorial Alrevés', 'Editorial Amarante', 'Editorial Autores de Argentina', 'Editorial Kattigara', 'Editorial La Tuerca', 'Editorial Medí', 'Editorial Medí', 'Editorial Sur', 'eLibros', 'Emblem Editions', 'Europa', 'Faber and Faber Crime', 'FaithWords', 'FaithWords; 1 edition', 'fallen leaves press (TM) and ignacio hills press (TM) IgnacioHillsPress.com', 'fallen leaves press (TM), e-Pulp Adventures (TM) and ignacio hills press (TM) IgnacioHillsPress.com', 'fallen leaves press (TM), ignacio hills press (TM), E-Pulp Adventures (TM)', 'Farrar, Straus and Giroux', 'Farrar, Straus and Giroux; 1 edition', 'Fawcett', 'Fawcett; Reprint edition', 'Fawcett; Revised edition', 'Forever', 'Forever Yours', 'Forge Books', 'Forge Books; 1 edition', 'Francesco Libri', 'Free Press', 'FSG Originals', 'Gallery Books', 'Grand Central Publishing', 'Grand Central Publishing; 1 edition', 'Grand Central Publishing; 1st edition', 'Grand Central Publishing; First edition', 'Grand Central Publishing; Reprint edition', 'Graywolf Press', 'GRIJALBO', 'Grove Press', 'Grove Press, Black Cat', 
'Grove/Atlantic', 'Grupo Nelson', 'Hachette Digital', 'Hackett Publishing Co.', 'Halcyon Press Ltd.', 'Hard Case Crime', 'Harlequin', 'Harlequin Anthology', 'Harlequin Historical', 'Harlequin Ibérica, S.A.', 'Harlequin Intrigue', 'Harlequin Medical Romance', 'Harlequin MIRA', 'Harlequin Special Releases', 'Harlequin Treasury-Harlequin Intrigue 90s', 'Harlequin Treasury-Silhouette Special Edition 90s', 'Harper', 'Harper Design', 'Harper Paperbacks', 'Harper Perennial', 'Harper Perennial; Original edition', 'Harper Perennial; Reprint edition', 'Harper Voyager', 'Harper; Original edition', 'Harper; Reprint edition', 'HarperCollins', 'HarperCollins Canada', 'HarperCollins e-books', 'HarperCollins e-books; 1 edition', 'HarperCollins e-books; 1 Reissue edition', 'HarperCollins e-books; 1 Reprint edition', 'HarperCollins e-books; 1st edition', 'HarperCollins e-books; 256 edition', 'HarperCollins e-books; Mti Rep edition', 'HarperCollins e-books; Org Mti edition', 'HarperCollins e-books; Original edition', 'HarperCollins e-books; Reissue edition', 'HarperCollins e-books; Repack edition', 'HarperCollins e-books; Reprint edition', 'HarperCollins ebooks', 'HarperPerennial Classics', 'HarperPress', 'Harvard University Press', 'Headline', 'Headline Books, Inc.', 'Henry Holt and Co.', 'Hogarth', 'Holt Paperbacks', 'Houghton Mifflin Harcourt', 'Howard Books', 'Hyperion', 'Hyperion e-books', 'Ian Fleming Publications', 'ignacio hills press (TM) IgnacioHillsPress.com', 'ignacio hills press (TM) IgnacioHillsPress.com and e-Pulp Adventures (TM)', 'Il Gatto e la Luna', 'Il leone verde Edizioni', 'Image', 'InterMix', 'Island Books', 'Island Fiction', 'Ivy Books', 'Ivy Books; 1st Ballantine Books Ed edition', 'Ivy Books; 1st edition', 'Ivy Books; Reprint edition', 'Jordán Dorado', 'Jordi Díez', 'Jove', 'Kensington', 'Kensington Books', 'Knopf', 'Knopf Canada', 'Knopf Group E-Books', 'La Esfera de los Libros', 'La Factoría de Ideas', 'La Factoría de Ideas', 'La Flor del Itapebí', 'Lübbe 
Digital', 'Laboratorio Editorial TusRelatos SL', 'Little, Brown and Company', 'Little, Brown and Company; 1 edition', 'Little, Brown and Company; 1st edition', 'Loveswept', 'LUMEN', 'Macmillan', 'Macmillan Australia', 'Macmillan New Writing', 'Macmillan UK', 'McClelland & Stewart', 'McClelland & Stewart', 'Metropolitan Books', 'Midnight Ink', 'Minotaur Books', 'Minotaur Books; 1 edition', 'Minotaur Books; Reprint edition', 'Mira', 'Mira; 1 edition', 'Mira; 1 Original edition', 'Mira; Original edition', 'Modern Library', 'Modern Library; Modern Library edition', 'Modern Library; Modern Library Pbk. Ed edition', 'MONDADORI', 'Montlake Romance', 'MTV Books', 'Mulholland Books', 'Mulholland Books; 1 edition', 'Mulholland Books; Reprint edition', 'Multnomah Books', 'Mysterious Press', 'MysteriousPress.com/Open Road', 'NAL', 'Nan A. Talese', 'Newmarket Press', 'North Point Press', 'NYRB Classics', 'Oceanview Publishing', 'One World/Ballantine', 'One World/Ballantine; 1st edition', 'One World/Ballantine; Reprint edition', 'One World/Strivers Row', 'Onyx', 'Open Road', 'Open Road E-riginal', 'Open Road Iconic Ebooks', 'Orb Books', 'Orbit', 'Ordóñez Díaz, Olegario / Ediciones Cátedra Pedagógica', 'Originally published by Avon Books in 1996', 'Orion', 'Overlook', 'Oxford University Press', 'Oxford University Press, UK', 'Oxford University Press, USA', 'Palgrave Macmillan', 'Pan', 'Pan Books', 'Pantheon', 'Penguin', 'Penguin Books', 'Penguin Classic', 'Penguin Non-Classics', 'Perigee', 'Phoenix', 'Picador', 'Picador; 1 edition', 'Pinnacle Books', 'Pintail', 'PLAZA & JANES', 'PLAZA & JANES', 'PLAZA Y JANES', 'Plume', 'Pocket Books', 'POCKET BOOKS (SIMO)', 'Pocket Books/Star Trek', 'Pocket Star', 'Pocket Star; Original edition', 'Poisoned Pen Press', 'Presidio Press', 'Princeton University Press', 'Putnam Adult', 'Random House', 'Random House Australia', 'Random House Publishing Group', 'Random House Trade Paperbacks', 'Random House Trade Paperbacks; 1 edition', 'Random House 
Trade Paperbacks; Original edition', 'Random House Trade Paperbacks; Reprint edition', 'Random House, Inc.', 'Random House; 1 edition', 'Random House; 1st edition', 'Random House; 1st Unabridged edition', 'Random House; Book Club edition', 'Rayo', 'Reagan Arthur / Back Bay Books', 'Reagan Arthur Books', 'Reagan Arthur Books; 1 edition', 'Reagan Arthur Books; Special edition', 'Revell', 'Riverhead', 'Roc', 'ROSA VENTS', 'RosettaBooks', 'Rough Guides', 'Sarah Crichton Books', 'Scribner', 'Severn House Digital', 'Severn House Digital; Reprint edition', 'Severn Press', 'Severn Select', 'Signet', 'Signet Classics', 'Silhouette Bombshell', 'Silhouette Desire', 'Silhouette Intimate Moments', 'Silhouette Nocturne', 'Silhouette Romantic Suspense', 'Silhouette Special Releases', 'Simon & Schuster', 'Simon & Schuster; 1 edition', 'Simon & Schuster; 1 Reprint edition', 'Simon & Schuster; 1st edition', 'Simon & Schuster; 1st Simon & Schuster Hardcover Ed edition', 'Simon & Schuster; 1st Simon & Schuster Pbk. Ed edition', 'Simon & Schuster; Original edition', 'Simon & Schuster; Reprint edition', 'Simon & Schuster', 'Simon & Schuster Audio', 'Simon & Schuster UK', 'Simon Pulse', 'Soft Skull Press', 'Soho Constable', 'Soho Crime', 'Soho Press', 'Sourcebooks Casablanca', 'Sourcebooks Landmark', 'Spectra', 'Spectra; Original edition', 'Spectra; Reissue edition', 'Spectra; Reprint edition', 'Spiegel & Grau', 'Spiegel & Grau; 1 edition', 'Spiegel & Grau', "St. Martin's Dead Letter", "St. Martin's Griffin", "St. Martin's Griffin; 1 edition", "St. Martin's Griffin; Original edition", "St. Martin's Paperbacks", "St. Martin's Paperbacks; 1 edition", "St. Martin's Press", "St. Martin's Press; 1 edition", "St. Martin's Press; First Edition edition", "St. 
Martin's Press; Reprint edition", 'Strebor Books', 'SUDAMERICANA', 'Suma de letras', 'Tarcher', 'The Dial Press', 'The Dial Press; 1 edition', 'The Dial Press; 1st edition', 'The Penguin Press', 'Thomas & Mercer', 'Thomas & Mercer', 'Thomas & Mercer / Plympton', 'Thomas Dunne Books', 'Thomas Nelson', 'Thomas Nelson Publishers', 'Threshold Editions', 'TIME WARNER PAPERBAC', 'Titan Books', 'Tor Books', 'Tor Classics', 'Tor Fantasy', 'Touchstone', 'Touchstone; Original edition', 'Touchwood Editions', 'Transworld Digital', 'Twelve', 'Tyndale House Publishers, Inc.', 'Tyrus Books', 'University of Chicago Press', 'University of New Mexico Press', 'University of Wisconsin Press', 'University Press of Mississippi', 'University Press of New England', 'Vanguard Press', 'Vida', 'VIKING ADULT', 'Villard', 'Villard; 1st edition', 'Vintage', 'Vintage Canada', 'Vintage Digital', 'Vintage; 1 edition', 'Vintage; Reprint edition', 'Vision', 'W. W. Norton & Company', 'W. W. Norton & Company', 'Walk Worthy Press', 'Walker Books', 'Walker Childrens', 'Washington Square Press', 'WaterBrook Press', 'William Morrow', 'William Morrow Paperbacks', 'William Morrow Paperbacks; 1 edition', 'William Morrow Paperbacks; Masterpiece ed edition', 'William Morrow Paperbacks; Open market ed edition', 'William Morrow Paperbacks; Original edition', 'William Morrow Paperbacks; Reissue edition', 'William Morrow Paperbacks; Reprint edition', 'William Morrow; 1 edition', 'William Morrow; Reprint edition', 'Windblown Media', 'Zondervan', 'David C. 
Cook', 'Bethany House', 'Bethany House Publishers', 'Bello', 'Chronicle Books', 'Melville International Crime', 'Steeple Hill Love Inspired Suspense', 'Steeple Hill Single Title', 'Bitter Lemon Press', 'Bold Strokes Books', 'd', 'Pegasus Books', 'Princeton Halls Press', 'Library Tales Publishing', 'Medallion Press', 'Mantle', 'Speck Press', 'Silhouette Athena Force', 'Acacia Publishing, Inc.', 'AAA Reality Games', 'Addison Moore', 'Addison Moore Publishing', 'Aladdin', 'Alfaguara Juvenil', 'Alloy Entertainment', 'AMACOM', "Amazon Children's Publishing", 'Amazon Publishing', 'Amazon.com', 'American Cancer Society', 'American Girl', 'American Psychological Association', 'AMG Publishers', 'Amulet Books', 'Amulet Books; 1st edition', 'Annick Press', 'Archie Comics', 'Arthur A. Levine Books', 'Astraea Press', 'Atheneum', 'Atheneum Books for Young Readers', 'Atlantic Publishing Group Inc.', 'Bancroft Press', 'Barbour Books', 'Beaufort Books', 'Bell Bridge Books', 'Berlinica Publishing LLC', 'Bluewood Books', 'Book Peddlers, The', 'Darby Creek', 'Michael Wiese Productions', "NYR Children's Collection", 'Platypus Press', 'Prufrock Press', 'Red Iris Books', 'Running Press Kids', 'Sandcastle Publishing LLC', 'WiDo Publishing', "Writer's Digest Books", 'Writers Digest Books', "Barron's Educational Series", 'BenBella Books', 'Bloomsbury Childrens', 'Bloomsbury Publishing', 'Bloomsbury USA Childrens', 'Body and Soul Publishing', 'Candlewick', 'Candlewish DRM-Free', 'Canterbury House Publishing', 'Carolrhoda Lab TM', 'Carpe Luna Publishing', 'Carpe Luna, Ltd.', 'Chicago Review Pr', 'Chicago Review Press', 'Chicken Soup for the Soul', 'Coliloquy, LLC', 'Cooper Square Publishing Llc', 'DC Comics', 'Disney', 'Disney Hyperion', 'Disney Hyperion; 1 edition', 'Echelon Press', 'ECW Press', 'Edic', 'Ediciones Selectas Diamante', 'Educational Game Books/Anti-Aging Press, Inc.', 'EgmontUSA', 'Entangled Teen', 'Evolved Publishing', 'Fairview Press', 'Flux', 'Free Spirirt Publishing', 'Free 
Spirit Publishing', 'Gibbering Gnome Press, A Division of Ingenious Inventions Run Amok, Ink', 'Gibbs Smith', 'Gibbs-Smith', 'Grace Publishing', 'Graphia', 'Hachette Digital ', 'Hachette India', 'Hampton Roads Pub Co', "Harcourt Children's Books", 'Harlequin Teen', 'HarperCollins; 1 edition', 'HarperCollins; 1st Avon Ed edition', 'HarperCollins; Reprint edition', "HarperCollinsChildren'sBooks", 'HarperTeen', 'HarperTeen; 1 edition', 'HarperTeen; 1st Avon Ed edition', 'Harvest House Publishers', 'Haunted Computer Books', 'Hay House', 'HCI', 'HCI Teens', 'Health Communications', 'HJ Kramer/New World Library', 'HMH', 'HMH Books', "Hodder Children's Books", 'Houghton Mifflin', 'Houghton Mifflin Books for Children', 'Houghton Mifflin Harcourt; 1 edition', 'Instant Help', 'Jessica Kingsley Publishers', 'Jossey-Bass', 'Kensington Publishing Corp', 'Kimani TRU', 'Kirkdale Press', 'Kregel Publications', 'Lift Every Voice', 'Little, Brown Books for Young Readers', 'Living Ink Books, an imprint of AMG Publishers', 'Llewellyn Publications', "Macmillan Children's Books", 'Marcher Lord Press', 'Marcus Institute of Commercial Modeling', 'Margaret K. McElderry Books', 'Marvel', 'Middlebury House Publishing', 'Moody Publishers', 'Musa Publishing', "National Geographic Children's Books", 'Navpress', 'NavPress Publishing Group', 'Open Road Young Readers', 'Orchard Books', "Orion Children's", 'Pelican Publishing Company', 'Point', 'Point; 1 edition', 'Prometheus Books', 'Puffin', 'Pyr', 'Quirk Books', 'Ragz Books', 'Random House Books for Young Readers', 'Raven Publishing, Inc. 
of Montana', 'Reprints', 'RHCB Digital', 'RHCP Digital', 'Roca Juvenil', 'Scholastic Inc.', 'Scholastic Paperbacks', 'Scholastic Press', 'Shadow Mountain', 'Simon & Schuster Books for Young Readers', 'Simon & Schuster Books for Young Readers; 1 edition', 'Simon & Schuster Books for Young Readers; 1 Reprint edition', 'Simon & Schuster Books for Young Readers; 1st Simon Pulse Ed edition', 'Simon & Schuster Books for Young Readers; Original edition', 'Simon & Schuster Books for Young Readers; Reprint edition', 'Simon & Schuster/Paula Wiseman Books; 1 edition', 'Simon & Schuster Books for Young Readers', 'Simon Pulse/Mercury Ink', 'Simon Pulse/Mercury Ink; 1 edition', 'Simon Spotlight', 'Sky Pony Press', 'Sourcebooks Fire', 'Sourcebooks Jabberwocky', 'Spencer Hill Press', 'StoneHouse Ink', 'Tanglewood Press', 'Taylor & Francis', 'The Chicken House', 'Tor Teen', 'Tuttle Publishing', 'Tyndale House Publishers', 'Tyndale Kids', 'University of Nebraska Press', 'Vanguard Management', 'Vanguard Management, Inc.', 'Wizards of the Coast', 'Workman Publishing Company', 'Yale University Press', 'Zest Books', 'Collins Educational', 'Delacorte Books for Young Readers', 'Laurel Leaf', 'Zonderkidz', 'Knopf Books for Young Readers', 'Soho Teen', 'Simon & Schuster/Paula Wiseman Books', 'Wiley', 'Tor', 'Enslow Publishers, Inc.', 'Facts on File', 'Facts on File (J)', 'Wiley-Blackwell', 'Alfaguara Infantil', 'Allen & Unwin', 'Almadraba Editorial', 'Almadraba Infantil y Juvenil', 'Ambush Books', 'David Fickling Books', 'Schwartz & Wade', 'Razorbill', 'Signet Classic', 'Greenwillow Books', 'Collins', 'HarperFestival', 'HarperCollins; Revised edition', 'The Friday Project', 'HarperTeen; Reprint edition', 'Yen Press', 'MONTENA', 'Aladdin/Beyond Words', 'Simon & Schuster Books for Young Readers; 1st edition', 'Simon Pulse; Reprint edition', 'Dundurn', 'HarperCollins; 1 Reprint edition', 'Harper Element', 'Scholastic Non Fiction', 'Scholastic Non-Fiction', 'Allen & Unwin', 'Springer Publishing 
Company', 'Rutgers University Press', 'Routledge', 'Scarecrow Press', 'Prestwick House, Inc.', 'Pook Press', 'Pelican Publishing', 'Academic Group Publishing - BRAII', 'Andrews McMeel Publishing LLC', 'AudioGO', 'Bailiwick Press', 'Baker Book House Company', 'Basic Books', "BBC Children's Books", 'Book View Cafe', 'Carolrhoda Books', 'Cedar Fort, Inc.', 'Chelsea House', 'Clarion Books', 'Conari Press', 'Disney Press', 'Edições Paulinas -São Paulo-Brazil', 'Ediciones B Mexico S.A. de C.V.', 'Egmont', 'Faber and Faber Plays', "Frances Lincoln Children's Books", 'Galaxy Press', 'Gospel Light', 'Greenwood', 'Groundwood Books', 'Harmony Ink Press', 'Hardie Grant Egmont', 'HarperCollins; 1st Avon ed edition', 'Jessica Kingsley', 'Jewish Lights Publishing', 'John Wiley & Sons, Inc.', 'Kaplan Test Prep', 'La factorÃa de ideas', 'Lechner Syndications', 'Libraries Unlimited', 'M P Publishing Limited', 'McFarland', 'McGraw-Hill', 'Mirrorstone', 'Minnesota Historical Society Press', 'New Horizon Press', 'Nickelodeon Publishing', 'Nomad Press', 'Orca Book Publishers', 'Patria Press', 'Push', 'Regal', 'Roberts Rinehart', 'Roca Editorial', 'Running Press', 'Sandpiper', 'Simon & Schuster Books For Young Readers', 'Smart Pop', 'Sourcebooks', 'St. Paul Press', 'T & F Books UK', 'The Jewish Publication Society', 'Torch Legacy Publications', 'Ulysses Press', 'Univ Of Minnesota Press', 'University of California Press', 'Walter Foster', 'William Gladden Foundation Press', 'Zion Christian Publishers', 'Alfred A. 
Knopf', 'Amistad', 'Amistad; 1 edition', 'Authonomy', 'Avery', 'Back Stage Books', 'Baker Books', 'Balzer + Bray', 'Balzer + Bray; 1 edition', 'Bantam Books for Young Readers', 'Beach Lane Books', 'Bindu Books', 'Bloomberg Press', 'Bluefire', 'Cambridge University Press', 'Candle Books', 'Candlewick DRM-Free', 'Candlewick; Reprint edition', 'Celestial Arts', 'Chamberton Publishing', 'Chelsea House Pub (L)', 'Chelsea House Publications', 'Chronicle Books LLC', 'Cinco Puntos Press', 'Collins Voyager', 'Compass Press', 'Crossroad Press', 'Crown Archetype', 'Crushing Hearts and Black Butterfly Publishing', 'Curiosity Quills Press', 'Curtis Brown Digital', 'Dafina', 'Decadent Publishing Company', 'Dell Books for Young Readers', 'Desert Breeze Publishing, Inc', 'Devine Destinies', 'DIAL', 'DK Publishing', 'Doubleday Books for Young Readers', "DUTTON CHILDREN'S", 'e-penguin', 'Ebury Digital', 'Echelon Press LLC', 'Eerdmans Books for Young Readers', 'Eloquent Books', 'Ember', 'Entangled Publishing', 'ePenguin', 'Etopia Press', 'Evernight Publishing', "Faber and Faber Children's Books", 'Family Audio Library', 'Family Psychological Press', 'Farrar, Straus and Giroux (BYR)', 'Farrar, Strauss & Giroux', 'Featherweight Publishing', 'Feiwel & Friends', 'Feiwel & Friends', 'Firebird', 'First Second', 'Flash Point', 'Fourth Estate', 'Fremantle Press', 'Gallery', 'Gibbs Smith, Publisher', 'Golden Books', 'Gotham Books', 'Greenwillow Books; 1 edition', 'Greenwillow Books; 1st edition', 'Grosset & Dunlap', 'Grosset & Dunlap', 'Gypsy Shadow Publishing', 'Hampton Roads Publishing', 'Harlequin Blaze', 'Harlequin HQN', 'Harlequin Nocturne', 'HarperCollins; 1st edition', 'HarperOne', 'HarperTeen; Revised edition', 'HarperTrue', 'Henry Holt and Co. 
(BYR)', 'Holiday House', 'Houghton Mifflin Books for Children; Reprint edition', 'Houghton Mifflin Harcourt; Reprint edition', 'HQN Books', 'Image Cascade Publishing', 'ImaJinn Books, Inc', 'Inkspell Publishing', 'Itoh Press', 'K-Teen/Dafina', 'Katherine Tegen Books', 'Katherine Tegen Books; 1 edition', 'Kelpies', 'Lands Atlantic Publishing', 'Leap Books', 'Limitless Publishing LLC', "Lion Children's", 'LITTLE BROWN BKS YOUNG READERS', 'Little, Brown', 'Little, Brown Books for Young Readers; 1 edition', 'Little, Brown Books for Young Readers; 1st edition', 'Lovestruck Literary', 'Lyrical Press, Inc.', 'Martin Sisters Publishing LLC', 'Merit Press', 'Merriam-Webster, Inc.', 'Modern Library; Tra edition', 'Montena', 'MP Publishing Limited', 'MTV Books; Original edition', 'Mundania Press LLC', 'MuseItUp Publishing', 'Myrddin Publishing Group', 'Neeland Media LLC', 'New Canadian Library', 'Noble Romance Publishing, LLC', 'Noble Young Adult - Not Just Romance!', 'NPC Books', 'NUBE DE TINTA', 'OakTara', 'One World/Ballantine; 1 edition', 'Open Road Media', 'Open Road Media Young Readers', 'Outskirts Press', 'Outskirts Press, Inc.', 'Papercutz', 'Penguin Young Readers', 'Persea', 'PHILOMEL', 'Pocket Star; Reprint edition', 'Poppy', 'Potter Craft', 'Prizm Books', 'PSS Juvenile', 'Putnam Juvenile', 'Queerteen Press', 'Random House BFYR', 'Rhemalda Publishing', 'Ripley Publishing', 'Roaring Brook Press', 'Samhain Publishing, Ltd.', 'Scholastic Fiction', 'Scholastic Press; 1 edition', 'Schwartz & Wade', 'Screech Owls', 'Secret Cravings Publishing', 'Simon & Schuster Books for Young Readers; New title edition', 'Simon & Schuster Books for Young Readers; Reissue edition', "Simon & Schuster Children's Publishing", 'Simon & Schuster/Paula Wiseman Books', 'Simon Pulse; 1 edition', 'Simon Pulse; Original edition', 'Simon Pulse/Beyond Words', 'Siren Publishing', 'Skylark', 'Sleeping Bear Press', 'Solstice Publishing', 'Speak', 'Square Fish', 'Starscape', 'Storey Publishing, LLC', 
'Ten Speed Press', "The O'Brien Press", 'Three Rivers Press', 'Top Shelf Productions', 'Townsend Press', 'Trafford Publishing', 'Tricycle Press', 'Tundra Books', 'Turquoise Morning Press', 'University Of Chicago Press', 'University of Minnesota Press', 'University of Nevada Press', 'University of Pennsylvania Press', 'University of Queensland Press', "Viking Children's", 'Viking Juvenile', 'Walden Pond Press', 'Watson-Guptill', 'Weinstein Books', 'Wendy Lamb Books', 'Wheatmark', 'Whiskey Creek Press LLC', 'Wild Child Publishing', 'Wm. B. Eerdmans Publishing Company', 'World Castle Publishing', 'Yearling', 'Young Picador', 'Zebra', 'Zebra Books', 'Zeta Comics', 'Zondervan/Youth Specialties', 'Faber and Faber Fiction', 'Mercury Ink', 'Hodder', 'Penguin Classics', 'PUTNAM', 'Gallery Books/G-Unit', 'Broadway Books', 'Viking Adult', 'Redhook', 'Crown Forum', 'Dial Press Trade Paperback', 'Dreamspinner Press', 'DAW', 'Faber and Faber Non Fiction', 'Faber and Faber Poetry', 'Hodder & Stoughton', 'LucasBooks', 'Simon & Schuster; Rep Una edition', 'Putnam', "G.P. Putnam's Sons", 'Review', 'HarperCollins e-books; Ecco edition', 'Winepress Publishing', 'HarperCollins Entertainment', 'Blue Door', 'HarperCollins; Mti edition', 'Faber Finds', 'Voyager'].freeze
-
1
EXCLUDED_SOLD_BY = ['Hachette Book Group','HarperCollins Publishers','HarperCollins Publishing','Macmillan','Penguin Publishing','Random House Digital, Inc.','Random House Mondadori','Simon and Schuster Digital Sales Inc'].freeze
-
-
1
class AsinOnlyIngestedTitlesReport
-
1
include Sidekiq::Worker
-
1
sidekiq_options queue: :reporting
-
-
1
def perform
-
1
client_name = :booklr
-
1
report_hash = EnterpriseReports.generate_report_hash("asin-only-report-#{Date.current.strftime("%m%d%y")}", client_name)
-
1
csv = EnterpriseReports.open_csv(report_hash)
-
-
1
csv << ['ASIN', 'TLD', 'Title', 'Author', 'Publisher', 'Format', 'Published Date', 'Page Count', 'Amazon Link', 'Matched ISBN13', 'Matched BN ID', 'No Match Found']
-
-
1
warehousebook_version_ids = WarehouseBookVersion.where("isbn13 is null and asin is not null and bn_id is null").ingested.order(:source).value_of :id
-
-
1
(warehousebook_version_ids.count / 1000 + 1).times do |x|
-
1
WarehouseBookVersion.select([:id, :tld, :asin, :title, :author_name, :publisher, :book_format, :pub_date, :pages]).where(id: warehousebook_version_ids[(1000 * x)..(1000 * (x + 1) - 1)], status: "ingested").order(:source).each do |warehouse_book_version|
-
1
csv << [warehouse_book_version.asin, warehouse_book_version.tld, warehouse_book_version.title, warehouse_book_version.author_name, warehouse_book_version.publisher, warehouse_book_version.book_format, warehouse_book_version.pub_date, warehouse_book_version.pages, warehouse_book_version.amazon_url]
-
end
-
end
-
-
1
csv.flush
-
-
1
EnterpriseReports.move_to_s3(client_name, csv)
-
1
csv.close
-
end
-
end
-
-
1
class Identification
-
1
include Sidekiq::Worker
-
1
sidekiq_options queue: :reporting
-
-
1
def perform(report_date_string, amazon_average_rating, amazon_review_count, days_of_data_min, min_page_count, add_extra_columns, report_name, categories)
-
10
Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_name.to_s.camelcase} Identification Report -----"}
-
-
5
report_date = report_date_string.to_date
-
5
warehouse_date_id = WarehouseDate.find_by(date: report_date).id
-
5
warehouse_region_id = WarehouseRegion.com.id
-
5
creation_date_minimum = (Date.current - days_of_data_min.days).in_time_zone.to_s
-
5
starting_warehouse_date_id = WarehouseDate.find_by(date: report_date - 89.days).id
-
5
warehouse_dates = WarehouseDate.where(id: starting_warehouse_date_id..warehouse_date_id)
-
5
warehouse_date_ids = WarehouseDate.where(id: starting_warehouse_date_id..warehouse_date_id).value_of(:id)
-
5
warehouse_categories = WarehouseCategory.amazon
-
5
client_name = :booklr
-
5
client_config = AmazeBot.config[:reports][:clients][client_name]
-
-
10
warehouse_book_versions = WarehouseBookVersion.where{(warehouse_stats.warehouse_date_id == warehouse_date_id) & (warehouse_stats.warehouse_region_id == warehouse_region_id)}.where("warehouse_stats.amazon_average_rating > #{amazon_average_rating} and warehouse_stats.amazon_review_count > #{amazon_review_count}").where('warehouse_book_versions.created_at < ?', creation_date_minimum).where{
-
20
(warehouse_stats.amazon_similar_item_category_tree_1.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_2.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_3.like_any my{categories}) |
-
20
(warehouse_stats.amazon_similar_item_category_tree_4.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_5.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_6.like_any my{categories}) |
-
20
(warehouse_stats.amazon_similar_item_category_tree_7.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_8.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_9.like_any my{categories}) |
-
20
(warehouse_stats.amazon_similar_item_category_tree_10.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_11.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_12.like_any my{categories}) |
-
25
(warehouse_stats.amazon_similar_item_category_tree_13.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_14.like_any my{categories}) | (warehouse_stats.amazon_similar_item_category_tree_15.like_any my{categories})}.joins(:warehouse_stats).to_a
-
-
# Exclude publishers and get uniq list
-
5
warehouse_book_versions = warehouse_book_versions.reject {|warehouse_book_version| EnterpriseReports::StatisticalReports::EXCLUDED_PUBLISHERS.include? warehouse_book_version.publisher}
-
5
warehouse_book_versions = warehouse_book_versions.reject {|warehouse_book_version| EnterpriseReports::StatisticalReports::EXCLUDED_SOLD_BY.include? warehouse_book_version.sold_by}
-
5
warehouse_book_versions = warehouse_book_versions.reject {|warehouse_book_version| warehouse_book_version.pages.to_i < min_page_count} if min_page_count
-
5
warehouse_book_version_ids = warehouse_book_versions.collect(&:id).uniq
-
-
10
Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_name.to_s.camelcase} Identification Complete, report generation starting -----"}
-
-
5
report_hash = EnterpriseReports.generate_report_hash("#{report_name}-identification-report-#{report_date.strftime("%m%d%y")}", client_name)
-
5
report_csv = EnterpriseReports.open_csv(report_hash)
-
-
# Append header row
-
5
header = ['Title', 'Author', 'ASIN', 'ISBN', 'Pub Date', 'Publisher']
-
5
header += ['Sold By'] if add_extra_columns
-
5
header += ['Page Count', 'Days of Data', 'Created Date', 'Total Days Since Creation' '90 Day Average Overall Rank', '30 Day Moving Average',
-
'7 Day Moving Average', 'Trendline Growth %', 'R-Squared', '90 Day Overall Rank Growth Rate']
-
5
header += ['90 Day Overall Rank Volatility', 'Apple Number of Ratings', 'BN number of Ratings'] if add_extra_columns
-
5
header += ['Amazon Number of Likes', 'Amazon Number of Ratings']
-
5
header += ['Ratings Per Day Since Published', 'Reviews Per Day Over Last 90 Days'] if add_extra_columns
-
5
header += ['Average Star Rating', '% of Ratings 4 or above', '#1 Similar Category', '#2 Similar Category', '#3 Similar Category',
-
'#1 Sub Category', '#1 Sub Category Percentage', '#2 Sub Category', '#2 Sub Category Percentage', '#3 Sub Category',
-
'#3 Sub Category Percentage', 'Current Sales Rank', 'Current Price', 'Product URL', 'Amazon Description']
-
5
report_csv << header
-
-
5
block_size = 20
-
5
(warehouse_book_version_ids.count / block_size + 1).times do |count|
-
10
Rails.logger.tagged('enterprise') {Rails.logger.info "----- Generating #{report_name.to_s.camelcase} report for #{(block_size * count)..(block_size * (count + 1) - 1)} out of #{warehouse_book_version_ids.count} book versions at #{Time.current}-----"}
-
-
WarehouseStat.select(WarehouseStat::WAREHOUSE_STAT_FIELDS + WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS).where(
-
5
warehouse_book_version_id: warehouse_book_version_ids[(block_size * count)..(block_size * (count + 1) - 1)]).where(
-
warehouse_date_id: warehouse_date_ids, warehouse_region_id: warehouse_region_id).order(:warehouse_book_version_id, :warehouse_date_id).chunk{
-
5
|el| el['warehouse_book_version_id']}.each do |warehouse_book_version_id, warehouse_stats|
-
-
warehouse_book_version = warehouse_book_versions.select {|warehouse_book_version| warehouse_book_version.id == warehouse_book_version_id}.first
-
if warehouse_stats.collect(&:amazon_sales_rank).compact.blank?
-
Rails.logger.tagged('enterprise') {Rails.logger.info "skipping warehouse_book_version #{warehouse_book_version.id}, no sales rank data"}
-
next
-
elsif warehouse_stats.count < days_of_data_min
-
Rails.logger.tagged('enterprise') {Rails.logger.info "skipping warehouse_book_version #{warehouse_book_version.id}, less than #{days_of_data_min} days of data"}
-
next
-
end
-
-
stat_count = warehouse_stats.count
-
first_stat = warehouse_stats.first
-
last_stat = warehouse_stats.last
-
created_date = WarehouseBookVersion.find_by(asin: warehouse_book_version.asin).created_at.to_date
-
total_days = (Date.current - created_date).to_i
-
-
first_stat_date = warehouse_dates.select{|warehouse_date| warehouse_date.id == first_stat.warehouse_date_id}.first.date
-
last_stat_date = warehouse_dates.select{|warehouse_date| warehouse_date.id == last_stat.warehouse_date_id}.first.date
-
row = [warehouse_book_version.title, warehouse_book_version.author_name, warehouse_book_version.asin, warehouse_book_version.isbn13,
-
warehouse_book_version.pub_date, warehouse_book_version.publisher]
-
row << warehouse_book_version.sold_by if add_extra_columns
-
row += [warehouse_book_version.pages, stat_count, created_date.to_s, total_days]
-
-
# Regression Setup
-
amazon_sales_ranks = warehouse_stats.collect(&:amazon_sales_rank).compact
-
amazon_sales_ranks_x_values = (1..amazon_sales_ranks.count).to_a
-
line_fit = LineFit.new
-
valid = line_fit.setData(amazon_sales_ranks_x_values, amazon_sales_ranks)
-
-
row << amazon_sales_ranks.mean
-
row << amazon_sales_ranks.moving_average(30).last
-
row << amazon_sales_ranks.moving_average(7).last
-
row << (valid ? ((line_fit.forecast(1) - line_fit.forecast(amazon_sales_ranks_x_values.count)) / (line_fit.forecast(1).abs) * 100).round(2).to_s + "%" : nil)
-
row << (valid ? line_fit.rSquared.round(3) : nil)
-
row << Formulas.average_growth_rate(first_stat.amazon_sales_rank, last_stat.amazon_sales_rank, last_stat_date - first_stat_date, :negative)
-
-
-
row += [amazon_sales_ranks.standard_deviation, last_stat.itunes_rating_count, last_stat.bn_review_count] if add_extra_columns
-
row += [last_stat.amazon_likes, last_stat.amazon_review_count]
-
if add_extra_columns
-
row << (last_stat.amazon_review_count / (report_date - warehouse_book_version.pub_date.to_date).to_f if last_stat.amazon_review_count.present? && warehouse_book_version.pub_date.present?)
-
row << ((last_stat.amazon_review_count - first_stat.amazon_review_count) / stat_count.to_f if first_stat.amazon_review_count.present? && last_stat.amazon_review_count.present?)
-
end
-
-
row << last_stat.amazon_average_rating
-
if last_stat.amazon_review_count.present? && (last_stat.five_star_count.present? || last_stat.four_star_count.present?)
-
top_count = (last_stat.five_star_count || 0) + (last_stat.four_star_count || 0)
-
row << ((top_count.to_f / last_stat.amazon_review_count) * 100).to_s + "%"
-
else
-
row << '0%'
-
end
-
-
# Top Similar Item Categories
-
b = Hash.new(0)
-
warehouse_stats.compact.map {|warehouse_stat| WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS.map {|key| warehouse_stat.send(key)}}.flatten.compact.map {|value| b[value] +=1}
-
values = b.sort_by{|key, value| value}.reverse.first(3).flatten.reject {|value| value.is_a?(Integer)}
-
row += EnterpriseReports.pad_serialized_data(values,3) {|value| value}
-
-
# Top Sub Categories
-
b = Hash.new(0)
-
warehouse_stats.map {|warehouse_stat| warehouse_stat.attributes.keys.select{|key| key.include?('amazon_category') && key.include?('id')}.map {|key| warehouse_stat.send(key)}}.flatten.compact.map {|value| b[value] +=1}
-
sorted_values = b.sort_by{|k, v| v}.reverse.first(3)
-
sorted_values = sorted_values.map {|x,y| [warehouse_categories.select{|c| c.id == x}.first.name, y]}
-
row += EnterpriseReports.pad_serialized_data(sorted_values,6) {|value| value.flatten.each_with_index.map {|value,i| i.odd? ? (value.to_f/stat_count*100).to_s + "%" : value}}
-
-
row += [last_stat.amazon_sales_rank, (last_stat.amazon_price / 100.0 if last_stat.amazon_price.present?), Urls.amazon_book_page(warehouse_book_version.asin, '.com')]
-
row << warehouse_book_version.amazon_book_description
-
-
report_csv << row
-
end
-
end
-
-
5
report_csv.flush
-
-
# Pass reports array to mailer and deliver
-
5
EnterpriseReports.move_to_s3(client_name, report_csv)
-
5
EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports]["#{report_name}_identification"]).deliver
-
-
5
report_csv.close
-
-
10
Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{report_name.to_s.camelcase} Identification Report Delivered -----"}
-
end
-
end
-
-
1
class SubcategoryAverageSalesRankReport
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # Builds a CSV of the average Amazon sales rank for the first 30 canonical
  # .com AmazonBookCategory categories on the given date, then uploads it to S3.
  #
  # report_date_string - a String parseable by String#to_date (e.g. "2015-01-02").
  def perform(report_date_string)
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: report_date).id
    warehouse_region_id = WarehouseRegion.com.id

    # Map of canonical .com book category id => name.
    warehouse_category_id_names = {}
    WarehouseCategory.where(tld: ".com", status: "canonical", category_type: "AmazonBookCategory").value_of(:id, :name).each {|k, v| warehouse_category_id_names[k] = v}

    client_name = :booklr
    report_hash = EnterpriseReports.generate_report_hash("subcategory-average-salesrank-report-#{report_date.strftime("%m%d%y")}", client_name)
    csv = EnterpriseReports.open_csv(report_hash)
    csv << ['category_name', 'average_sales_rank']
    warehouse_category_id_names.first(30).each do |cat_id, cat_name|
      # BUG FIX: average_sales_rank_for_date_category_region is a class method
      # (def self....), but it was previously called with an implicit receiver
      # from this instance method, which raises NoMethodError. Dispatch through
      # self.class so the existing class-level API keeps working for callers.
      csv << [cat_name, self.class.average_sales_rank_for_date_category_region(warehouse_date_id, cat_id, warehouse_region_id)]
    end
    csv.flush

    EnterpriseReports.move_to_s3(client_name, csv)
    csv.close
  end

  # Integer-truncated mean of amazon_sales_rank across stats for the book
  # versions listed in the given category on the given date/region.
  # Returns nil when no ranks are available.
  def self.average_sales_rank_for_date_category_region(warehouse_date_id, warehouse_category_id, warehouse_region_id = 1)
    sales_ranks = WarehouseStat.where(warehouse_region_id: warehouse_region_id, warehouse_date_id: warehouse_date_id, warehouse_book_version_id: WarehouseListStat.where(warehouse_category_id: warehouse_category_id, warehouse_date_id: warehouse_date_id).value_of(:warehouse_book_version_id) - [nil]).value_of(:amazon_sales_rank).compact
    # NOTE: integer division truncates the average — preserved deliberately so
    # existing report output stays stable.
    sales_ranks.sum / sales_ranks.count if sales_ranks.present?
  end
end
-
end
-
end
-
1
module EnterpriseReports
-
1
module WeeklyReports
-
1
# Weekly "Customer Behavior" report for the RHPG client: pulls this week's and
# last week's WarehouseStat rows for every ingested RHPG book version, emits
# one CSV row per book version, uploads to S3 and (conditionally) emails it.
class RHPG
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # report_date_string - String parseable by #to_date; the report week's date.
  # deliver_email      - when false, the report is generated and uploaded but
  #                      the notification email is skipped.
  def perform(report_date_string, deliver_email = true)
    report_date = report_date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: report_date).id
    warehouse_last_week_date_id = WarehouseDate.find_by(date: report_date - 7.days).id
    client_name = :rhpg
    client_config = AmazeBot.config[:reports][:clients][client_name]
    user = User.find_by email: 'rhpg@booklr.com'
    asins = user.warehouse_book_versions.ingested.value_of(:asin)

    Rails.logger.tagged('enterprise') {Rails.logger.info "----- Customer Behavior report on #{asins.count} book versions for: #{user.name} -----"}

    report_file_name = "rhpg-customer-behavior-report-#{report_date.strftime('%m%d%y')}"
    report_hash = EnterpriseReports.generate_report_hash(report_file_name, client_name)
    report_csv = EnterpriseReports.open_csv(report_hash)

    # CSV header. Column order must match the row-building code below exactly.
    report_csv << ['ISBN', 'ASIN', 'Book Title', 'Book Type', 'Pub Date', 'Author', 'B&N Number of Ratings', '# Change 1 Week', '% Change 1 Week',
                   'B&N Average Star Rating', '# Change 1 Week', '% Change 1 Week', 'AZ Number of Ratings', '# Change 1 Week',
                   '% Change 1 Week', 'AZ Average Star Rating', '# Change 1 Week', '% Change 1 Week', 'AZ Number 1 Star',
                   'AZ Number 2 Star', 'AZ Number 3 Star', 'AZ Number 4 Star', 'AZ Number 5 Star', 'AZ Number of Likes',
                   'AZ: Who Bought this Item also Bought Title 1', 'AZ: Who Bought this Item also Bought Author 1',
                   'AZ: Who Bought this Item also Bought Title 2', 'AZ: Who Bought this Item also Bought Author 2',
                   'AZ: Who Bought this Item also Bought Title 3', 'AZ: Who Bought this Item also Bought Author 3',
                   'AZ: Who Bought this Item also Bought Title 4', 'AZ: Who Bought this Item also Bought Author 4',
                   'AZ: Who Bought this Item also Bought Title 5', 'AZ: Who Bought this Item also Bought Author 5',
                   'AZ: Who Bought this Item also Bought Title 6', 'AZ: Who Bought this Item also Bought Author 6',
                   'AZ: Buy After Viewing This Item? Title 1', 'AZ: Buy After Viewing This Item? Author 1',
                   'AZ: Buy After Viewing This Item? Star Rating 1', 'AZ: Buy After Viewing This Item? Number of Ratings 1',
                   'AZ: Buy After Viewing This Item? Price 1', 'AZ: Buy After Viewing This Item? Title 2',
                   'AZ: Buy After Viewing This Item? Author 2', 'AZ: Buy After Viewing This Item? Star Rating 2',
                   'AZ: Buy After Viewing This Item? Number of Ratings 2', 'AZ: Buy After Viewing This Item? Price 2',
                   'AZ: Buy After Viewing This Item? Title 3', 'AZ: Buy After Viewing This Item? Author 3',
                   'AZ: Buy After Viewing This Item? Star Rating 3', 'AZ: Buy After Viewing This Item? Number of Ratings 3',
                   'AZ: Buy After Viewing This Item? Price 3', 'AZ: Buy After Viewing This Item? Title 4',
                   'AZ: Buy After Viewing This Item? Author 4', 'AZ: Buy After Viewing This Item? Star Rating 4',
                   'AZ: Buy After Viewing This Item? Number of Ratings 4', 'AZ: Buy After Viewing This Item? Price 4',
                   'AZ Frequently Bought Together Name 1', 'AZ Frequently Bought Together Type 1', 'AZ Frequently Bought Together Price 1',
                   'AZ Frequently Bought Together Name 2', 'AZ Frequently Bought Together Type 2', 'AZ Frequently Bought Together Price 2',
                   'Similar Items By Category 1', 'Similar Items By Category 2', 'Similar Items By Category 3', 'Similar Items By Category 4', 'Similar Items By Category 5',
                   'Similar Items By Category 6', 'Similar Items By Category 7', 'Similar Items By Category 8', 'Similar Items By Category 9', 'Similar Items By Category 10',
                   'Similar Items By Category 11', 'Similar Items By Category 12', 'Similar Items By Category 13', 'Similar Items By Category 14', 'Similar Items By Category 15']

    warehouse_book_version_ids = WarehouseBookVersion.com.where(asin: asins).value_of(:id)
    row_count = 0

    # Process book versions in batches of 1000 ids per SQL query.
    (warehouse_book_version_ids.count / 1000 + 1).times do |count|
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- Generating report for #{(1000 * count)..(1000 * (count + 1) - 1)} out of #{warehouse_book_version_ids.count} book versions -----"}
      sql = WarehouseStat.single_query_join_and_select(%w[bn_review_count bn_average_rating amazon_review_count amazon_average_rating one_star_count two_star_count three_star_count four_star_count five_star_count amazon_likes] +
                                                       WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_FIELDS.select {|field| field.include?('title') || field.include?('author')} +
                                                       WarehouseStat::WAREHOUSE_AMAZON_BOUGHT_AFTER_VIEWING_FIELDS.reject {|field| field.include?('asin')} +
                                                       WarehouseStat::WAREHOUSE_AMAZON_FREQUENTLY_BOUGHT_TOGETHER_FIELDS +
                                                       WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS,
                                                       {warehouse_book_version: %w[id isbn13 asin title book_format pub_date author_name]}, nil).where{(warehouse_stats.warehouse_book_version_id.in warehouse_book_version_ids[(1000 * count)..(1000 * (count + 1) - 1)])}.where(warehouse_region_id: WarehouseRegion.com.id, warehouse_date_id: [warehouse_date_id,warehouse_last_week_date_id]).order(:warehouse_book_version_id, :created_at).to_sql

      # Rows are ordered by book version then created_at, so chunking groups the
      # (up to two) date rows per book version: when two rows are present the
      # second is the current week and the first is last week.
      ActiveRecord::Base.connection.execute(sql).chunk {|el| el['warehouse_book_version_id']}.each do |warehouse_book_version_id, arr|
        stat = arr.count > 1 ? arr.second.with_indifferent_access : arr.first.with_indifferent_access
        last_week_stat = arr.count > 1 ? arr.first.with_indifferent_access : nil

        row = [stat[:warehouse_book_version_isbn13], stat[:warehouse_book_version_asin], stat[:warehouse_book_version_title],
               stat[:warehouse_book_version_book_format], stat[:warehouse_book_version_pub_date], stat[:warehouse_book_version_author_name]]

        # NOTE(review): when last_week_stat is absent these 12 change columns
        # are skipped entirely (row is shorter), not padded — presumably the
        # CSV consumer tolerates that; confirm.
        if last_week_stat.present?
          row += EnterpriseReports.output_field_and_change_and_percentage_change(stat, last_week_stat, :bn_review_count)
          row += EnterpriseReports.output_field_and_change_and_percentage_change(stat, last_week_stat, :bn_average_rating, true)
          row += EnterpriseReports.output_field_and_change_and_percentage_change(stat, last_week_stat, :amazon_review_count)
          row += EnterpriseReports.output_field_and_change_and_percentage_change(stat, last_week_stat, :amazon_average_rating, true)
        end

        row += [stat[:one_star_count], stat[:two_star_count], stat[:three_star_count], stat[:four_star_count],
                stat[:five_star_count], stat[:amazon_likes]]

        row += EnterpriseReports.output_customer_behavior_fields(stat, 'amazon_also_bought')
        row += EnterpriseReports.output_customer_behavior_fields(stat, 'bought_after_viewing')
        row += EnterpriseReports.output_customer_behavior_fields(stat, 'frequently_bought')
        row += EnterpriseReports.output_customer_behavior_fields(stat, 'similar_item')

        report_csv << row
        row_count += 1
      end
    end

    report_csv.flush

    # Upload finished report whether it's complete or not
    EnterpriseReports.move_to_s3(client_name, report_csv)

    # Determine if report is complete and then email about it
    if EnterpriseReports.report_count_valid? row_count, asins.count
      EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:customer_behavior]).deliver if deliver_email
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{user.name.parameterize} Weekly Report Delivered -----"}
    else
      EnterpriseReports.send_report_count_error report_file_name, row_count, asins.count
      Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{user.name.parameterize}(#{client_name}) Weekly Report NOT Delivered: row count off by 0.5% of more -----"}
    end

    report_csv.close
  end
end
-
-
1
# Weekly "all customer behavior" v2 report for the RHINC client: builds a
# single SELECT over WarehouseStat joined to book-version and author-rank
# tables, then streams it straight to a gzipped CSV via COPY and delivers it.
class RhincCustomerBehavior
  include EnterpriseReports
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # report_date_string   - String parseable by #to_date.
  # ftp                  - forwarded to the delivery helper.
  # deliver_email        - when false, no emailable report name is passed on.
  # ignore_report_blocker - when true, skips the expected-row-count guard.
  def perform(report_date_string, ftp = true, deliver_email = true, ignore_report_blocker = false)
    ActiveRecord::Base.connection.uncached do
      report_date = report_date_string.to_date

      # Base stat columns plus the v1 customer-behavior field groups.
      base_selects = %w[bn_review_count bn_average_rating amazon_review_count amazon_average_rating
                        one_star_count two_star_count three_star_count four_star_count five_star_count amazon_likes]
      base_selects += WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_FIELDS
      base_selects += WarehouseStat::WAREHOUSE_AMAZON_BOUGHT_AFTER_VIEWING_FIELDS
      base_selects += WarehouseStat::WAREHOUSE_AMAZON_FREQUENTLY_BOUGHT_TOGETHER_FIELDS
      base_selects += WarehouseStat::WAREHOUSE_AMAZON_SIMILAR_ITEM_CATEGORY_TREE_FIELDS

      # v2-only field groups.
      v2_selects = WarehouseStat::WAREHOUSE_AMAZON_ALSO_BOUGHT_ITEMS_BY_FIELDS + WarehouseStat::WAREHOUSE_BN_ALSO_BOUGHT_FIELDS

      # Author-rank columns: prefer the joined category name, fall back to the
      # raw fallback column stored on the stat row.
      author_rank_selects = ['overall_author_rank AS "amazon_overall_author_rank", ',
                            'COALESCE(sub_category1_author_rank_id.name, sub_category1_author_rank_id_fallback) AS "amazon_sub_category1_author_rank_id_name", sub_category1_author_rank AS "amazon_sub_category1_author_rank", ',
                            'COALESCE(sub_category2_author_rank_id.name, sub_category2_author_rank_id_fallback) AS "amazon_sub_category2_author_rank_id_name", sub_category2_author_rank AS "amazon_sub_category2_author_rank", ',
                            'COALESCE(sub_category3_author_rank_id.name, sub_category3_author_rank_id_fallback) AS "amazon_sub_category3_author_rank_id_name", sub_category3_author_rank AS "amazon_sub_category3_author_rank", ',
                            'COALESCE(sub_category4_author_rank_id.name, sub_category4_author_rank_id_fallback) AS "amazon_sub_category4_author_rank_id_name", sub_category4_author_rank AS "amazon_sub_category4_author_rank"'].join

      # Build the relation step by step; the select/join order determines the
      # CSV column order, so it must stay exactly as below.
      relation = WarehouseStat.joins(:warehouse_book_version)
                              .outer_joins(:sub_category1_author_rank_id, :sub_category2_author_rank_id, :sub_category3_author_rank_id, :sub_category4_author_rank_id)
                              .transforming_select(true, base_selects)
                              .join_select('inner', false, warehouse_book_version: %w[title book_format asin isbn13 bn_id author_name])
                              .transforming_select(true, v2_selects)
                              .select(author_rank_selects)
                              .where(warehouse_region_id: WarehouseRegion.com.id, warehouse_date_id: WarehouseDate.find_by(date: report_date).id)
                              .order(:warehouse_book_version_id)

      expected_count = ignore_report_blocker ? nil : WarehouseBookVersion.com.ingested.count
      report_name = "rhinc-all-customer-behavior-v2-#{report_date.strftime('%m%d%y')}"
      sql_copy_to_csv_and_deliver_report(relation.to_sql, :rhinc, report_name, expected_count, ftp: ftp, emailable_report_name: (deliver_email ? :customer_behavior : nil), gzip: true)
    end
  end
end
-
-
1
# Daily German competitive-titles report for the RHDE client. Reads the day's
# Mongo collection of scraped title families (RHDE titles + competitive titles
# grouped by WorkID/parent ASIN), writes one CSV row per title, appends any
# "broken titles" at the end, uploads to S3 and emails the report.
class RHDECompetitiveMongoReport
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # report_date_string - String parseable by #to_date; defaults to today.
  # deliver_email      - when false, upload still happens but no email is sent.
  def perform(report_date_string = Date.current.to_s, deliver_email = true)
    report_date = report_date_string.to_date
    client_name = :rhde
    client_config = AmazeBot.config[:reports][:clients][client_name]

    user = User.find_by email: 'rhde@booklr.com'

    Rails.logger.tagged('enterprise') {Rails.logger.info "----- German Competitive Report for: #{user.name} -----"}

    report_file_name = "daily-competitive-titles-report-#{report_date.strftime('%m%d%y')}-#{user.name.parameterize}"
    report_hash = EnterpriseReports.generate_report_hash(report_file_name, client_name)
    csv = EnterpriseReports.open_csv(report_hash)

    # CSV header; row-building below must stay in this column order.
    csv << ['ISBN13', 'WorkID', 'ASIN', 'Title', 'Author', 'Publisher', 'OSD', 'Format', 'Size', 'List Price', 'Digital List Price', 'Consumer Price',
            'Rank', 'Language', 'Competitive?', 'Featured?', 'Category 1', 'Category 1 rank', 'Category 2', 'Category 2 rank', 'Category 3',
            'Category 3 rank', 'Scraped On Date']

    row_count = 0

    collection = MongoUtilities.daily_collection(:de_competitive_format_data, report_date)
    block_size = 1000

    # Iterate over the collection with no timeout and do it in slices for optimized memory usage
    # ignore broken_titles document and iterate over those separately at the end
    collection.find({_id: {'$ne' => 'broken_titles'}}, timeout: false) do |cursor|
      total = collection.count
      i = 1
      cursor.each_slice(block_size) do |slice|
        Rails.logger.tagged('enterprise') {Rails.logger.info "#{i*block_size} out of #{total}"}
        i += 1

        # Each document (per slice) represents all data for a single parent asin/ WorkID. Each document has 2 arrays
        # of hashes, one for RHDE titles and one for the competitive titles
        slice.each do |title_family|
          # combine both array of hashes and output to the csv
          titles = []
          titles += title_family['rhde_titles'] if title_family['rhde_titles'].present?
          # Competitive titles get tagged so the 'Competitive?' column below is 'Y'.
          titles += title_family['competitive_titles'].map {|x| x.merge!('competitive' => true)} if title_family['competitive_titles'].present?
          parent_asin = title_family['_id'].split('-').first

          titles.each do |title|
            pub_date = ScraperUtilities.parse_date_string(title['pub_date']) || 'invalid'

            # 2nd column is WorkID unless it doesn't exist then we use the parent_asin to tie titles together
            # Prices are stored in cents; .try chains convert to decimal units and
            # pass nil through for missing values.
            row = [title['isbn13'], (title_family['WorkID'] || parent_asin), title['asin'], title['title'],
                   title['author'], title['publisher'], pub_date.to_s, title['book_format'], title['physical_details'],
                   title['amazon_list_price'].try(:to_i).try(:/, 100.0), title['digital_list_price'].try(:to_i).try(:/, 100.0),
                   title['amazon_price'].try(:to_i).try(:/, 100.0), title['amazon_sales_rank'], title['language']]
            row << (title['competitive'] ? 'Y' : 'N')
            # title_family["featured"] is an array of all of the featured asins for this WorkID/parent asin
            row << (title_family['featured'].include?(title['asin']) ? 'Y' : 'N')
            row += [title['sub_category1_tree'], title['sub_category1_rank'], title['sub_category2_tree'], title['sub_category2_rank'], title['sub_category3_tree'], title['sub_category3_rank'], Date.current.to_s]

            # Transliterate strings to ASCII for the CSV consumer.
            csv << row.collect {|value| value.is_a?(String) ? ActiveSupport::Inflector.transliterate(value) : value}
            row_count += 1
          end
        end
      end
    end

    if collection.find({_id: 'broken_titles'}).count > 0
      csv << ['----------- BROKEN TITLES -----------']

      row_count += 1
      i = 1
      total = collection.find({_id: 'broken_titles'}).first['titles'].count

      # Iterate over broken titles and output as much data is available from the page
      collection.find({_id: 'broken_titles'}).first['titles'].each do |title|
        Rails.logger.tagged('enterprise') {Rails.logger.info "#{i} out of #{total}"} if i % 1000 == 0
        i += 1
        # NOTE(review): `.to_s || 'invalid'` — to_s never returns nil, so the
        # 'invalid' fallback here is unreachable (unlike the main loop above,
        # which applies || before to_s); verify intended behavior.
        pub_date = title['pub_date'].present? ? (ScraperUtilities.parse_date_string(title['pub_date']).to_s || 'invalid') : ''

        # Always has N/A on the featured title column, broken titles can never be featured
        row = [title['isbn13'], title['work_id'], title['asin'], title['title'], title['author'], title['publisher'],
               pub_date, title['book_format'], title['physical_details'], title['amazon_list_price'].try(:to_i).try(:/, 100.0),
               title['digital_list_price'].try(:to_i).try(:/, 100.0), title['amazon_price'].try(:to_i).try(:/, 100.0),
               title['amazon_sales_rank'], title['language'], 'N', 'N/A', title['sub_category1_tree'], title['sub_category1_rank'],
               title['sub_category2_tree'], title['sub_category2_rank'], title['sub_category3_tree'],
               title['sub_category3_rank'], Date.current.to_s].collect {|value| value.is_a?(String) ? ActiveSupport::Inflector.transliterate(value) : value}

        csv << row
        row_count += 1
      end
    end

    csv.flush

    # Upload finished report whether it's complete or not
    EnterpriseReports.move_to_s3(client_name, csv)

    # Set redis details and mail report
    $redis.hmset('daily_report_stats', 'rhde-row-count', row_count, 'rhde-send-time', Time.current.to_s)
    EnterpriseReportsMailer.basic_report(report_hash, client_config[:reports][:daily_report]).deliver if deliver_email
    Rails.logger.tagged('enterprise') {Rails.logger.info "----- #{user.name.parameterize} German Competitive Report Delivered -----"}

    csv.close
  end
end
-
end
-
end
-
1
module EtlWorkers
-
1
# Entry point for the nightly ETL pipeline: resolves (or creates) the
# WarehouseDate row for the target date, then kicks off the stats ETL followed
# by the list-stat ETL for that date.
class QueueNightlyEtl
  include Sidekiq::Worker
  sidekiq_options queue: :high

  # date_string - String parseable by #to_date; defaults to today.
  def perform(date_string = Date.current.to_s)
    etl_date = date_string.to_date
    date_id = WarehouseDate.where(date: etl_date).first_or_create.id

    # Stats ETL runs first with all three stat flags enabled, then list stats.
    EtlWorkers::NightlyStatsEtl.process_date(etl_date, date_id, true, true, true)
    EtlWorkers::NightlyListStatEtl.process_date(etl_date, date_id)
  end
end
-
-
1
class NightlyStatsEtl
-
1
include Sidekiq::Worker
-
1
sidekiq_options queue: :etl
-
-
1
BATCH_SIZE = 100000
-
-
1
# Streams one id-range slice of the nightly Mongo scrape collection into the
# warehouse via PostgreSQL COPY. Depending on the three etl_* flags it writes
# warehouse_stats rows, book_version_categories rows, and/or accumulates
# in-memory category stats, plus per-field scrape counts back into Mongo.
#
# collection_name          - Mongo collection holding the day's scraped records.
# starting_id / ending_id  - inclusive warehouse_book_version_id range to process.
# warehouse_region_com_id / warehouse_region_co_uk_id - region ids chosen per record tld.
# warehouse_date_id        - WarehouseDate id the stats are written under.
# date_string              - the date as a string, used as the Mongo counts key.
# etl_product_stats / etl_book_version_categories / etl_category_stats - feature flags.
def perform(collection_name, starting_id, ending_id, warehouse_region_com_id, warehouse_region_co_uk_id, warehouse_date_id, date_string, etl_product_stats, etl_book_version_categories, etl_category_stats)
  t = Benchmark.realtime do
    WarehouseRegion.connection.uncached do
      collection = $mongodb.collection collection_name
      existing_record = collection.find({warehouse_book_version_id: {'$gte' => starting_id, '$lte' => ending_id}}).limit(1).first
      # Idempotency guard: skip the whole slice if its first record already has
      # a stat row for this date. NOTE: this `return` exits perform from inside
      # the Benchmark block, so the timing log below is skipped too.
      return if etl_product_stats && existing_record.present? && WarehouseStat.where(warehouse_date_id: warehouse_date_id, warehouse_book_version_id: existing_record['warehouse_book_version_id']).exists?

      timestamp = Time.current.utc
      errmsg = stat_connection = categories_connection = category_stats = nil
      # Per-tld lookup of canonical category name => id.
      # NOTE(review): the inner block param `hash` shadows the outer one; works,
      # but rename would be clearer.
      warehouse_categories_by_tld = Utilities::TLDS.each_with_object({}) do |tld, hash|
        hash[tld] = WarehouseCategory.canonical.where(tld: tld).value_of(:name, :id).each_with_object({}) {|name_and_id, hash| hash[name_and_id[0]] = name_and_id[1]}
      end
      if etl_product_stats
        # Open a raw COPY stream for warehouse_stats; rows are pushed in
        # process_record_for_warehouse_stats and the stream is finalized below.
        stat_connection = WarehouseStat.connection.raw_connection
        stat_connection.exec("COPY #{WarehouseStat.table_name} (#{warehouse_stats_columns.join(',')}) FROM STDIN CSV NULL '' QUOTE '\"' ESCAPE '\\'")

        # Per-column non-null counters, plus two aggregate "any related format
        # data present" counters.
        field_counts = Hash[*warehouse_stats_columns.collect{|k| [k, 0]}.flatten]
        field_counts['bn_related_format_data_total'] = 0
        field_counts['amazon_related_format_data_total'] = 0
      end
      if etl_book_version_categories
        # Separate connection so this COPY stream is independent of the stats one.
        categories_connection = PostgresUtilities.get_new_connection(BookVersionCategory)
        categories_connection.exec("COPY #{BookVersionCategory.table_name} (#{book_version_categories_columns.join(',')}) FROM STDIN CSV NULL '' QUOTE '\"' ESCAPE '\\'")
      end
      category_stats = {} if etl_category_stats

      begin
        collection.find({warehouse_book_version_id: {'$gte' => starting_id, '$lte' => ending_id}}, timeout: false) do |cursor|
          cursor.each do |record|
            warehouse_region_id = record['tld'] == '.com' ? warehouse_region_com_id : warehouse_region_co_uk_id
            warehouse_category_ids_by_name = warehouse_categories_by_tld[record['tld']]

            if etl_product_stats
              stats = process_record_for_warehouse_stats stat_connection, record, warehouse_category_ids_by_name, warehouse_region_id, warehouse_date_id, timestamp

              # Calculate booklr stat counts
              bn_related_format_present = false
              amazon_related_format_present = false
              warehouse_stats_columns.zip(stats).each do |field, stat|
                if stat.present?
                  field_counts[field] += 1
                  bn_related_format_present = true if %w[related_formats_nook_ean related_formats_nook_price].include? field
                  amazon_related_format_present = true if %w[related_formats_kindle_price related_formats_kindle_asin related_formats_hardcover_price related_formats_hardcover_asin related_formats_mass_market_paperback_price related_formats_mass_market_paperback_asin related_formats_paperback_price related_formats_paperback_asin].include? field
                end
              end
              field_counts['bn_related_format_data_total'] += 1 if bn_related_format_present
              field_counts['amazon_related_format_data_total'] += 1 if amazon_related_format_present
            end

            process_record_for_book_version_categories categories_connection, record, warehouse_category_ids_by_name, warehouse_region_id if etl_book_version_categories
            process_record_for_category_stats record, category_stats, warehouse_category_ids_by_name, warehouse_region_id if etl_category_stats
          end

          if etl_product_stats
            # Accumulate this slice's counters into the daily Mongo counts doc.
            stat_collection = MongoUtilities.daily_scrape_field_counts_collection
            stat_collection.update({date: date_string}, {'$inc' => field_counts})
          end
        end

        if etl_category_stats && category_stats.present?
          CategoryStatsCollection.new(date_string.to_date).add_category_stats category_stats
        end
      rescue Errno => err
        # Pattern from the pg gem's COPY example: capture the error message so
        # finalize_copy_command can abort the COPY stream with it.
        # NOTE(review): rescuing the Errno module may not match SystemCallError
        # subclasses in all Ruby versions — confirm against the pg gem docs.
        errmsg = '%s while reading copy data: %s' % [err.class.name, err.message]
      end

      # Always close the COPY streams (with the error message, if any).
      PostgresUtilities.finalize_copy_command stat_connection, errmsg if etl_product_stats
      PostgresUtilities.finalize_copy_command categories_connection, errmsg if etl_book_version_categories
    end
  end

  Rails.logger.tagged('stats') {Rails.logger.info "NightlyStatsEtl Time taken: #{t}"}
end
-
-
1
# Maps one scraped Mongo record onto a warehouse_stats COPY row. The positional
# order of `stats` must match warehouse_stats_columns exactly — do not reorder.
# Writes the CSV-escaped row to the open COPY stream and returns the raw stats
# array (used by the caller to tally per-column counts).
#
# connection - raw PG connection with an active COPY ... FROM STDIN.
# record     - the Mongo document for one book version scrape.
# warehouse_category_ids_by_name - canonical category name => id for the record's tld.
def process_record_for_warehouse_stats(connection, record, warehouse_category_ids_by_name, warehouse_region_id, warehouse_date_id, timestamp)
  # Fixed scalar columns: timestamps, ids, ranks, prices (cents), social counts,
  # ratings. prepare_*_for_copy helpers handle escaping/nil for the CSV stream.
  stats = [timestamp, timestamp, Utilities.prepare_string_for_copy(record['_id']),
           warehouse_category_ids_by_name[record['sub_category1_tree']], Utilities.prepare_integer_for_copy(record['sub_category1_rank']),
           warehouse_category_ids_by_name[record['sub_category2_tree']], Utilities.prepare_integer_for_copy(record['sub_category2_rank']),
           warehouse_category_ids_by_name[record['sub_category3_tree']], Utilities.prepare_integer_for_copy(record['sub_category3_rank']),
           record['book_version_stat_id'], warehouse_date_id, warehouse_region_id, record['warehouse_book_version_id'],
           warehouse_category_ids_by_name[record['amazon_sales_rank_category']], Utilities.prepare_integer_for_copy(record['amazon_sales_rank']),
           Utilities.prepare_integer_for_copy(record['barnes_sales_rank']), Utilities.prepare_integer_for_copy(record['likes']),
           Utilities.prepare_integer_for_copy(record['amazon_list_price']), Utilities.prepare_integer_for_copy(record['amazon_price']),
           Utilities.prepare_integer_for_copy(record['digital_list_price']), Utilities.prepare_integer_for_copy(record['bn_nook_price']),
           Utilities.prepare_integer_for_copy(record['bn_nook_list_price']), Utilities.prepare_integer_for_copy(record['amazon_euro_price']),
           Utilities.prepare_integer_for_copy(record['amazon_aus_price']), Utilities.prepare_integer_for_copy(record['bn_price']),
           Utilities.prepare_integer_for_copy(record['bn_list_price']), Utilities.prepare_integer_for_copy(record['itunes_price']),
           Utilities.prepare_float_for_copy(record['itunes_average_rating']), Utilities.prepare_integer_for_copy(record['itunes_rating_count']),
           Utilities.prepare_integer_for_copy(record['stumbleupon_count']), Utilities.prepare_integer_for_copy(record['reddit_count']),
           Utilities.prepare_integer_for_copy(record['fb_commentsbox_count']), Utilities.prepare_integer_for_copy(record['fb_click_count']),
           Utilities.prepare_integer_for_copy(record['fb_comment_count']), Utilities.prepare_integer_for_copy(record['fb_like_count']),
           Utilities.prepare_integer_for_copy(record['fb_share_count']), Utilities.prepare_integer_for_copy(record['delicious_count']),
           Utilities.prepare_integer_for_copy(record['google_plus_count']), Utilities.prepare_integer_for_copy(record['twitter_count']),
           Utilities.prepare_integer_for_copy(record['digg_count']), Utilities.prepare_integer_for_copy(record['pinterest_count']),
           Utilities.prepare_integer_for_copy(record['linkedin_count']), Utilities.prepare_float_for_copy(record['amazon_average_rating']),
           Utilities.prepare_integer_for_copy(record['amazon_review_count']), Utilities.prepare_float_for_copy(record['barnes_average_rating']),
           Utilities.prepare_integer_for_copy(record['barnes_review_count']), Utilities.prepare_string_for_copy(record['amazon_availability']),
           record['kindle_unlimited'], Utilities.prepare_float_for_copy(record['goodreads_work_average_rating']),
           Utilities.prepare_integer_for_copy(record['goodreads_work_rating_count']), Utilities.prepare_integer_for_copy(record['goodreads_work_review_count']),
           Utilities.prepare_integer_for_copy(record['goodreads_work_added_by_count']), Utilities.prepare_integer_for_copy(record['goodreads_work_to_read_count']),
           Utilities.prepare_float_for_copy(record['goodreads_edition_average_rating']), Utilities.prepare_integer_for_copy(record['goodreads_edition_rating_count']),
           Utilities.prepare_integer_for_copy(record['goodreads_edition_review_count']), Utilities.prepare_integer_for_copy(record['goodreads_edition_added_by_count']),
           Utilities.prepare_integer_for_copy(record['goodreads_5_star_count']), Utilities.prepare_integer_for_copy(record['goodreads_4_star_count']),
           Utilities.prepare_integer_for_copy(record['goodreads_3_star_count']), Utilities.prepare_integer_for_copy(record['goodreads_2_star_count']),
           Utilities.prepare_integer_for_copy(record['goodreads_1_star_count']),
           Utilities.prepare_integer_for_copy(record['five_star_count']), Utilities.prepare_integer_for_copy(record['four_star_count']),
           Utilities.prepare_integer_for_copy(record['three_star_count']), Utilities.prepare_integer_for_copy(record['two_star_count']),
           Utilities.prepare_integer_for_copy(record['one_star_count']), Utilities.prepare_float_for_copy(record['itunes_gb_average_rating']),
           Utilities.prepare_integer_for_copy(record['itunes_gb_rating_count']), Utilities.prepare_integer_for_copy(record['itunes_gb_price']),
           Utilities.prepare_float_for_copy(record['itunes_au_average_rating']), Utilities.prepare_integer_for_copy(record['itunes_au_rating_count']),
           Utilities.prepare_integer_for_copy(record['itunes_au_price'])]
  # Fallback category-name columns: populated only when the scraped category
  # name did not resolve to a canonical warehouse category id above.
  stats += [(warehouse_category_ids_by_name[record['amazon_sales_rank_category']].present? ? nil : Utilities.prepare_string_for_copy(record['amazon_sales_rank_category'])),
            (warehouse_category_ids_by_name[record['sub_category1_tree']].present? ? nil : Utilities.prepare_string_for_copy(record['sub_category1_tree'])),
            (warehouse_category_ids_by_name[record['sub_category2_tree']].present? ? nil : Utilities.prepare_string_for_copy(record['sub_category2_tree'])),
            (warehouse_category_ids_by_name[record['sub_category3_tree']].present? ? nil : Utilities.prepare_string_for_copy(record['sub_category3_tree']))]
  # "Also bought items by" author columns 1-16.
  stats += [Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_1']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_2']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_3']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_4']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_5']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_6']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_7']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_8']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_9']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_10']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_11']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_12']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_13']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_14']),
            Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_15']), Utilities.prepare_string_for_copy(record['amazon_also_bought_items_by_16'])]
  # Up to 6 "also bought" items, 6 columns each, padded to 36 columns.
  # NOTE(review): star_count[0..-16] appears to strip a fixed-length suffix from
  # the scraped string — confirm against the scraper output format.
  stats += Utilities.pad_customer_behavior_data(record['also_bought'].try(:first, 6), 36) do |column_values, data, index|
    column_values[index * 6] = Utilities.prepare_string_for_copy(data['isbn_or_asin'])
    column_values[index * 6 + 1] = Utilities.prepare_string_for_copy(data['author'])
    column_values[index * 6 + 2] = Utilities.prepare_integer_for_copy(Utilities.ignore_bad_price_for_copy data['price'])
    column_values[index * 6 + 3] = Utilities.prepare_integer_for_copy(data['rating_count'])
    column_values[index * 6 + 4] = Utilities.prepare_float_for_copy(data['star_count'].try(:[], 0..-16))
    column_values[index * 6 + 5] = Utilities.prepare_string_for_copy(data['title'])
  end
  # Up to 4 "bought after viewing" items, 6 columns each, padded to 24 columns.
  stats += Utilities.pad_customer_behavior_data(record['bought_after_viewing'].try(:first, 4), 24) do |column_values, data, index|
    column_values[index * 6] = Utilities.prepare_string_for_copy(data['isbn_or_asin'])
    column_values[index * 6 + 1] = Utilities.prepare_string_for_copy(data['author'])
    column_values[index * 6 + 2] = Utilities.prepare_integer_for_copy(Utilities.ignore_bad_price_for_copy data['price'])
    column_values[index * 6 + 3] = Utilities.prepare_integer_for_copy(data['rating_count'])
    column_values[index * 6 + 4] = Utilities.prepare_float_for_copy(data['star_count'].try(:[], 0..-16))
    column_values[index * 6 + 5] = Utilities.prepare_string_for_copy(data['title'])
  end
  # Up to 2 "frequently bought together" items (the "This item: ..." self entry
  # is filtered out), 3 columns each, padded to 6 columns. Note the first block
  # here binds to try(:reject).
  stats += Utilities.pad_customer_behavior_data(record['frequently_bought_together'].try(:reject) {|x| x['title'].include? 'This item: '}.try(:first, 2), 6) do |column_values, data, index|
    column_values[index * 3] = Utilities.prepare_string_for_copy(data['type'])
    column_values[index * 3 + 1] = Utilities.prepare_integer_for_copy(data['price'])
    column_values[index * 3 + 2] = Utilities.prepare_string_for_copy(data['title'])
  end
  # Up to 15 similar-item category names, padded to 15 columns.
  stats += Utilities.pad_customer_behavior_data(record['similar_items_by_category'].try(:first, 15), 15) do |column_values, data, index|
    column_values[index] = Utilities.prepare_string_for_copy(data)
  end
  # Up to 15 similar-item category external ids, padded to 15 columns.
  stats += Utilities.pad_customer_behavior_data(record['similar_items_by_category_external_id'].try(:first, 15), 15) do |column_values, data, index|
    column_values[index] = Utilities.prepare_string_for_copy(data)
  end
  # Up to 6 B&N "also bought" items, 4 columns each, padded to 24 columns.
  stats += Utilities.pad_customer_behavior_data(record['bn_also_bought'].try(:first, 6), 24) do |column_values, data, index|
    column_values[index * 4] = Utilities.prepare_string_for_copy(data['ean'])
    column_values[index * 4 + 1] = Utilities.prepare_string_for_copy(data['author'])
    column_values[index * 4 + 2] = Utilities.prepare_integer_for_copy(data['price'])
    column_values[index * 4 + 3] = Utilities.prepare_string_for_copy(data['title'])
  end
  # Related-format price/id pairs per edition type.
  stats += [Utilities.prepare_integer_for_copy(record['Kindle Edition'].try(:[], 'price')), Utilities.prepare_string_for_copy(record['Kindle Edition'].try(:[], 'asin')),
            Utilities.prepare_integer_for_copy(record['MassMarketPaperback'].try(:[], 'price')), Utilities.prepare_string_for_copy(record['MassMarketPaperback'].try(:[], 'asin')),
            Utilities.prepare_integer_for_copy(record['NOOK Book'].try(:[], 'price')), Utilities.prepare_string_for_copy(record['NOOK Book'].try(:[], 'ean')),
            Utilities.prepare_integer_for_copy(record['Hardcover'].try(:[], 'price')), Utilities.prepare_string_for_copy(record['Hardcover'].try(:[], 'asin')),
            Utilities.prepare_integer_for_copy(record['Paperback'].try(:[], 'price')), Utilities.prepare_string_for_copy(record['Paperback'].try(:[], 'asin'))]

  # Author-rank columns: overall rank, four sub-category ranks, four resolved
  # category ids, and four fallback category names (only when unresolved).
  # 13 columns total; all nil when the record carries no author_ranks.
  if record['author_ranks'].present?
    sub_category_1 = record['author_ranks']['sub_category_1']
    sub_category_2 = record['author_ranks']['sub_category_2']
    sub_category_3 = record['author_ranks']['sub_category_3']
    sub_category_4 = record['author_ranks']['sub_category_4']
    stats += [Utilities.prepare_integer_for_copy(record['author_ranks']['overall_rank']),
              (Utilities.prepare_integer_for_copy(sub_category_1['rank']) if sub_category_1.present?),
              (Utilities.prepare_integer_for_copy(sub_category_2['rank']) if sub_category_2.present?),
              (Utilities.prepare_integer_for_copy(sub_category_3['rank']) if sub_category_3.present?),
              (Utilities.prepare_integer_for_copy(sub_category_4['rank']) if sub_category_4.present?),
              (warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_1['category_name'])] if sub_category_1.present?),
              (warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_2['category_name'])] if sub_category_2.present?),
              (warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_3['category_name'])] if sub_category_3.present?),
              (warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_4['category_name'])] if sub_category_4.present?),
              (sub_category_1.blank? || warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_1['category_name'])].present? ? nil : Utilities.prepare_string_for_copy(ScraperUtilities.convert_author_rank_category_name(sub_category_1['category_name']))),
              (sub_category_2.blank? || warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_2['category_name'])].present? ? nil : Utilities.prepare_string_for_copy(ScraperUtilities.convert_author_rank_category_name(sub_category_2['category_name']))),
              (sub_category_3.blank? || warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_3['category_name'])].present? ? nil : Utilities.prepare_string_for_copy(ScraperUtilities.convert_author_rank_category_name(sub_category_3['category_name']))),
              (sub_category_4.blank? || warehouse_category_ids_by_name[ScraperUtilities.convert_author_rank_category_name(sub_category_4['category_name'])].present? ? nil : Utilities.prepare_string_for_copy(ScraperUtilities.convert_author_rank_category_name(sub_category_4['category_name'])))]
  else
    stats += Array.new(13)
  end

  # Push the finished row into the COPY stream (nil renders as empty == NULL).
  connection.put_copy_data("#{stats.join(',')}\n")

  stats
end
-
-
1
# Streams one book_version_categories COPY row per similar-items category on
# the record, returning the rows that were written (nil when the record has
# no categories).
#
# connection - a raw PG connection already inside a COPY ... FROM STDIN.
# record - a Mongo stats document (reads 'similar_items_by_category' and
#          'warehouse_book_version_id').
def process_record_for_book_version_categories(connection, record, warehouse_category_ids_by_name, warehouse_region_id)
  category_names = record['similar_items_by_category']
  return unless category_names.present?

  category_names.map do |category_name|
    copy_row = [record['warehouse_book_version_id'],
                Utilities.prepare_string_for_copy(category_name),
                warehouse_category_ids_by_name[category_name],
                warehouse_region_id]
    connection.put_copy_data(copy_row.join(',') + "\n")
    copy_row
  end
end
-
-
1
# Folds one record's Amazon sales rank into the running per-category stats
# accumulator. For each category on the record it tracks best/worst rank, the
# book versions holding those ranks, and the full list of ranks seen.
#
# category_stats - mutable hash keyed by category name with '.' stripped
#                  (dots are removed because the key is stored in Mongo).
def process_record_for_category_stats(record, category_stats, warehouse_category_ids_by_name, warehouse_region_id)
  rank = Utilities.prepare_integer_for_copy(record['amazon_sales_rank'])
  category_names = record['similar_items_by_category']
  return unless rank.present? && category_names.present?

  category_names.map do |category_name|
    key = category_name.gsub('.', '')
    entry = (category_stats[key] ||= {best_rank: rank, worst_rank: rank, ranks: [],
                                      category_name: category_name,
                                      warehouse_region_id: warehouse_region_id,
                                      warehouse_category_id: warehouse_category_ids_by_name[category_name],
                                      best_rank_book_version_id: record['warehouse_book_version_id'],
                                      worst_rank_book_version_id: record['warehouse_book_version_id']})
    if rank < entry[:best_rank]
      entry[:best_rank] = rank
      entry[:best_rank_book_version_id] = record['warehouse_book_version_id']
    end
    if rank > entry[:worst_rank]
      entry[:worst_rank] = rank
      entry[:worst_rank_book_version_id] = record['warehouse_book_version_id']
    end
    entry[:ranks] << rank
  end
end
-
-
1
# Column order for the warehouse stats COPY statement. The position of each
# name must match the position of the corresponding value in the `stats`
# array built by the perform method — do not reorder without updating both.
def warehouse_stats_columns
  %w[created_at updated_at mongo_id
     warehouse_amazon_category1_id amazon_category1_rank
     warehouse_amazon_category2_id amazon_category2_rank
     warehouse_amazon_category3_id amazon_category3_rank
     tx_book_version_stat_id warehouse_date_id warehouse_region_id warehouse_book_version_id
     warehouse_amazon_sales_rank_category_id amazon_sales_rank bn_sales_rank amazon_likes amazon_list_price
     amazon_price amazon_digital_list_price bn_nook_price bn_nook_list_price amazon_euro_price
     amazon_aus_price bn_price bn_list_price itunes_price itunes_average_rating itunes_rating_count
     stumbleupon_count reddit_count fb_commentsbox_count fb_click_count fb_comment_count fb_like_count
     fb_share_count delicious_count google_plus_count twitter_count digg_count pinterest_count
     linkedin_count amazon_average_rating amazon_review_count bn_average_rating bn_review_count
     amazon_availability kindle_unlimited
     goodreads_work_average_rating goodreads_work_rating_count goodreads_work_review_count
     goodreads_work_added_by_count goodreads_work_to_read_count goodreads_edition_average_rating
     goodreads_edition_rating_count goodreads_edition_review_count goodreads_edition_added_by_count
     goodreads_5_star_count goodreads_4_star_count goodreads_3_star_count goodreads_2_star_count goodreads_1_star_count
     five_star_count four_star_count three_star_count two_star_count one_star_count
     itunes_gb_average_rating itunes_gb_rating_count itunes_gb_price
     itunes_au_average_rating itunes_au_rating_count itunes_au_price
     warehouse_amazon_sales_rank_category_id_fallback warehouse_amazon_category1_id_fallback
     warehouse_amazon_category2_id_fallback warehouse_amazon_category3_id_fallback
     amazon_also_bought_items_by_1 amazon_also_bought_items_by_2 amazon_also_bought_items_by_3
     amazon_also_bought_items_by_4 amazon_also_bought_items_by_5 amazon_also_bought_items_by_6
     amazon_also_bought_items_by_7 amazon_also_bought_items_by_8 amazon_also_bought_items_by_9
     amazon_also_bought_items_by_10 amazon_also_bought_items_by_11 amazon_also_bought_items_by_12
     amazon_also_bought_items_by_13 amazon_also_bought_items_by_14 amazon_also_bought_items_by_15
     amazon_also_bought_items_by_16
     amazon_also_bought_asin_1 amazon_also_bought_author_1 amazon_also_bought_price_1
     amazon_also_bought_rating_1 amazon_also_bought_star_count_1 amazon_also_bought_title_1
     amazon_also_bought_asin_2 amazon_also_bought_author_2 amazon_also_bought_price_2
     amazon_also_bought_rating_2 amazon_also_bought_star_count_2 amazon_also_bought_title_2
     amazon_also_bought_asin_3 amazon_also_bought_author_3 amazon_also_bought_price_3
     amazon_also_bought_rating_3 amazon_also_bought_star_count_3 amazon_also_bought_title_3
     amazon_also_bought_asin_4 amazon_also_bought_author_4 amazon_also_bought_price_4
     amazon_also_bought_rating_4 amazon_also_bought_star_count_4 amazon_also_bought_title_4
     amazon_also_bought_asin_5 amazon_also_bought_author_5 amazon_also_bought_price_5
     amazon_also_bought_rating_5 amazon_also_bought_star_count_5 amazon_also_bought_title_5
     amazon_also_bought_asin_6 amazon_also_bought_author_6 amazon_also_bought_price_6
     amazon_also_bought_rating_6 amazon_also_bought_star_count_6 amazon_also_bought_title_6
     amazon_bought_after_viewing_asin_1 amazon_bought_after_viewing_author_1 amazon_bought_after_viewing_price_1
     amazon_bought_after_viewing_rating_1 amazon_bought_after_viewing_star_count_1 amazon_bought_after_viewing_title_1
     amazon_bought_after_viewing_asin_2 amazon_bought_after_viewing_author_2 amazon_bought_after_viewing_price_2
     amazon_bought_after_viewing_rating_2 amazon_bought_after_viewing_star_count_2 amazon_bought_after_viewing_title_2
     amazon_bought_after_viewing_asin_3 amazon_bought_after_viewing_author_3 amazon_bought_after_viewing_price_3
     amazon_bought_after_viewing_rating_3 amazon_bought_after_viewing_star_count_3 amazon_bought_after_viewing_title_3
     amazon_bought_after_viewing_asin_4 amazon_bought_after_viewing_author_4 amazon_bought_after_viewing_price_4
     amazon_bought_after_viewing_rating_4 amazon_bought_after_viewing_star_count_4 amazon_bought_after_viewing_title_4
     amazon_frequently_bought_together_format_1 amazon_frequently_bought_together_price_1
     amazon_frequently_bought_together_title_1 amazon_frequently_bought_together_format_2
     amazon_frequently_bought_together_price_2 amazon_frequently_bought_together_title_2
     amazon_similar_item_category_tree_1 amazon_similar_item_category_tree_2
     amazon_similar_item_category_tree_3 amazon_similar_item_category_tree_4
     amazon_similar_item_category_tree_5 amazon_similar_item_category_tree_6
     amazon_similar_item_category_tree_7 amazon_similar_item_category_tree_8
     amazon_similar_item_category_tree_9 amazon_similar_item_category_tree_10
     amazon_similar_item_category_tree_11 amazon_similar_item_category_tree_12
     amazon_similar_item_category_tree_13 amazon_similar_item_category_tree_14
     amazon_similar_item_category_tree_15
     amazon_similar_item_category_external_id_1 amazon_similar_item_category_external_id_2
     amazon_similar_item_category_external_id_3 amazon_similar_item_category_external_id_4
     amazon_similar_item_category_external_id_5 amazon_similar_item_category_external_id_6
     amazon_similar_item_category_external_id_7 amazon_similar_item_category_external_id_8
     amazon_similar_item_category_external_id_9 amazon_similar_item_category_external_id_10
     amazon_similar_item_category_external_id_11 amazon_similar_item_category_external_id_12
     amazon_similar_item_category_external_id_13 amazon_similar_item_category_external_id_14
     amazon_similar_item_category_external_id_15
     bn_also_bought_ean_1 bn_also_bought_author_1 bn_also_bought_price_1 bn_also_bought_title_1
     bn_also_bought_ean_2 bn_also_bought_author_2 bn_also_bought_price_2 bn_also_bought_title_2
     bn_also_bought_ean_3 bn_also_bought_author_3 bn_also_bought_price_3 bn_also_bought_title_3
     bn_also_bought_ean_4 bn_also_bought_author_4 bn_also_bought_price_4 bn_also_bought_title_4
     bn_also_bought_ean_5 bn_also_bought_author_5 bn_also_bought_price_5 bn_also_bought_title_5
     bn_also_bought_ean_6 bn_also_bought_author_6 bn_also_bought_price_6 bn_also_bought_title_6
     related_formats_kindle_price related_formats_kindle_asin related_formats_mass_market_paperback_price
     related_formats_mass_market_paperback_asin related_formats_nook_price related_formats_nook_ean
     related_formats_hardcover_price related_formats_hardcover_asin related_formats_paperback_price
     related_formats_paperback_asin
     overall_author_rank sub_category1_author_rank sub_category2_author_rank sub_category3_author_rank sub_category4_author_rank
     sub_category1_author_rank_id sub_category2_author_rank_id sub_category3_author_rank_id sub_category4_author_rank_id
     sub_category1_author_rank_id_fallback sub_category2_author_rank_id_fallback
     sub_category3_author_rank_id_fallback sub_category4_author_rank_id_fallback]
end
-
-
1
# Column order for the book_version_categories COPY statement; must match the
# row layout produced by process_record_for_book_version_categories.
def book_version_categories_columns
  ['warehouse_book_version_id', 'category_name', 'warehouse_category_id', 'warehouse_region_id']
end
-
-
1
# Fans out NightlyStatsEtl jobs over every ingested warehouse book version for
# the given date. Each job receives an id range (slice boundaries) plus flags
# for which ETL steps to run.
def self.process_date(date_or_date_string, warehouse_date_id, etl_product_stats, etl_book_version_categories, etl_category_stats)
  date_string = date_or_date_string.is_a?(Date) ? date_or_date_string.to_s : date_or_date_string
  # book_version_categories is rebuilt from scratch when that ETL step is enabled.
  BookVersionCategory.connection.execute("truncate #{BookVersionCategory.table_name}") if etl_book_version_categories

  warehouse_region_com_id = WarehouseRegion.com.id
  warehouse_region_co_uk_id = WarehouseRegion.couk.id
  MongoUtilities.initialize_daily_scrape_field_counts(date_string)

  collection = MongoUtilities.daily_collection(:stats, date_string.to_date)

  # Select batch-boundary ids: the first row, the last row, and every row
  # whose row number is ≡ 0 or ≡ BATCH_SIZE-1 modulo BATCH_SIZE (string order).
  ids = WarehouseBookVersion.connection.execute("SELECT t.id FROM (SELECT id::varchar(255), row_number() OVER(ORDER BY id::varchar(255) ASC) AS row_asc FROM warehouse_book_versions where status = 'ingested') t WHERE t.row_asc % #{BATCH_SIZE} = 0 OR t.row_asc % #{BATCH_SIZE} = #{BATCH_SIZE - 1} OR t.row_asc = 1").values.flatten
  ids += WarehouseBookVersion.connection.execute("SELECT id::varchar(255) FROM warehouse_book_versions where status = 'ingested' order by id::varchar(255) DESC limit 1").values.flatten
  # Pair consecutive boundary ids into [first_id, last_id] job ranges.
  params = ids.uniq.each_slice(2).collect do |slice|
    [collection.name, slice.first, slice.last, warehouse_region_com_id, warehouse_region_co_uk_id, warehouse_date_id, date_string, etl_product_stats, etl_book_version_categories, etl_category_stats]
  end.compact

  Sidekiq::Client.push_bulk 'class' => EtlWorkers::NightlyStatsEtl, 'args' => params if params.present?
end
-
end
-
-
1
# ETL worker that bulk-loads one batch of Mongo list-stat documents into the
# warehouse_list_stats Postgres table via the COPY protocol.
class NightlyListStatEtl
  include Sidekiq::Worker
  sidekiq_options queue: :etl
  # Number of Mongo documents handled per job.
  BATCH_SIZE = 5000

  # Copies batch `batch_number` of `collection_name` into warehouse_list_stats.
  def perform(collection_name, warehouse_date_id, batch_number)
    collection = $mongodb.collection(collection_name)
    # Column order must match the `stats` array built below.
    warehouse_list_stat_columns = %w[created_at updated_at mongo_id
      title name rank days_in_top_100 price author warehouse_trend_id warehouse_book_version_id
      warehouse_category_id warehouse_date_id asin isbn bn_id itunes_id list_type]
    timestamp = Time.current.utc

    # Lookup tables resolved up front so the COPY loop does no SQL.
    warehouse_trend_id_by_name = WarehouseTrend.value_of(:name, :id).each_with_object({}) do |name_and_id, hash|
      hash[name_and_id[0]] = name_and_id[1]
    end
    # Collect every asin/isbn in this batch to resolve book version ids by tld.
    asins = collection.find.skip(batch_number * BATCH_SIZE).limit(BATCH_SIZE).collect {|record| record['stats'].collect {|stat| stat['asin']}}.flatten.compact
    isbn13s = collection.find.skip(batch_number * BATCH_SIZE).limit(BATCH_SIZE).collect {|record| record['stats'].collect {|stat| stat['isbn']}}.flatten.compact
    warehouse_book_version_id_by_tld_and_asin = WarehouseBookVersion.where(asin: asins).value_of(:tld, :asin, :id).each_with_object({'.com' => {}, '.co.uk' => {}}) do |asin_and_id, hash|
      hash[asin_and_id[0]][asin_and_id[1]] = asin_and_id[2]
    end
    warehouse_book_version_id_by_tld_and_isbn13 = WarehouseBookVersion.where(isbn13: isbn13s).value_of(:tld, :isbn13, :id).each_with_object({'.com' => {}, '.co.uk' => {}}) do |isbn13_and_id, hash|
      hash[isbn13_and_id[0]][isbn13_and_id[1]] = isbn13_and_id[2]
    end

    # TODO: mongo_id is not indexed, it needs to be before this query will return in a reasonable time
    # record = collection.find({}).skip(batch_number * BATCH_SIZE).limit(1).first
    # return if WarehouseListStat.where(warehouse_date_id: warehouse_date_id, mongo_id: record['mongo_id']).exists?

    # Start the COPY; rows are streamed with put_copy_data below.
    connection = WarehouseListStat.connection.raw_connection
    sql = "COPY warehouse_list_stats (#{warehouse_list_stat_columns.join(',')}) FROM STDIN CSV NULL '' QUOTE '\"' ESCAPE '\\'"
    connection.exec(sql)

    errmsg = nil
    begin
      collection.find({}, timeout: false) do |cursor|
        cursor.skip(batch_number * BATCH_SIZE).limit(BATCH_SIZE).each do |record|
          record['stats'].each do |stat|
            # One CSV row per stat; falls back from asin to isbn when resolving
            # the warehouse_book_version_id.
            stats = [timestamp, timestamp, Utilities.prepare_string_for_copy(record['_id']), Utilities.prepare_string_for_copy(stat['title']),
                     Utilities.prepare_string_for_copy(record['name']), Utilities.prepare_integer_for_copy(stat['rank']),
                     Utilities.prepare_integer_for_copy(stat['days_in_top_100']), Utilities.prepare_integer_for_copy(stat['price']),
                     Utilities.prepare_string_for_copy(stat['author']), warehouse_trend_id_by_name[stat['trend']],
                     (warehouse_book_version_id_by_tld_and_asin[record['tld']][stat['asin']] || warehouse_book_version_id_by_tld_and_isbn13[record['tld']][stat['isbn']]),
                     record['warehouse_category_id'], warehouse_date_id, Utilities.prepare_string_for_copy(stat['asin']),
                     Utilities.prepare_string_for_copy(stat['isbn']), Utilities.prepare_string_for_copy(stat['bn_id']),
                     Utilities.prepare_string_for_copy(record['itunes_id']), Utilities.prepare_string_for_copy(record['list_type'])]

            connection.put_copy_data("#{stats.join(',')}\n")
          end
        end
      end
    # NOTE(review): `rescue Errno` follows the pg gem's COPY example, but Errno
    # is a namespace module and SystemCallError subclasses do not appear to
    # include it, so this rescue may never match — confirm and consider
    # `rescue SystemCallError`.
    rescue Errno => err
      errmsg = '%s while reading copy data: %s' % [err.class.name, err.message]
    end

    # Ends the COPY, passing errmsg to abort it when streaming failed.
    PostgresUtilities.finalize_copy_command connection, errmsg
  end

  # Enqueues one job per BATCH_SIZE documents for each of the three daily
  # list-stat collections (Amazon, B&N, Apple).
  def self.process_date(date_or_date_string, warehouse_date_id)
    date = date_or_date_string.is_a?(Date) ? date_or_date_string : date_or_date_string.to_date
    collections = [MongoUtilities.daily_collection(:amazon_list_stats, date),
                   MongoUtilities.daily_collection(:bn_list_stats, date),
                   MongoUtilities.daily_collection(:apple_list_stats, date)]

    params = collections.inject([]) do |memo, collection|
      memo + (collection.count / BATCH_SIZE.to_f).ceil.times.collect do |batch_number|
        [collection.name, warehouse_date_id, batch_number]
      end
    end

    Sidekiq::Client.push_bulk 'class' => EtlWorkers::NightlyListStatEtl, 'args' => params
  end
end
-
-
1
# Persists newly-scraped Amazon book categories (collected in Redis by the
# scrapers) into warehouse_categories, then queues jobs to fill in each new
# category's parent.
class BookCategoryEtl
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  def perform
    columns = %w[name category_id tld depth status category_type]

    # Redis set members are [name, category_id, tld] triples.
    scraped = RedisUtilities.get_set_members :scraped_categories
    known = WarehouseCategory.amazon.where(name: scraped.collect { |data| data[0] }).value_of(:name, :category_id, :tld)

    # Keep only triples we have not stored yet whose name passes the
    # acceptability filter, shaped into batch_insert rows.
    rows = (scraped - known).select do |name, _, _|
      AmazonBestSellersPage.is_acceptable_amazon_category_name? name
    end.collect do |name, category_id, tld|
      [name, category_id, tld, Utilities.get_depth_from_category_name(name), 'alternative', 'AmazonBookCategory']
    end

    return if rows.blank?

    dw_ids = WarehouseCategory.batch_insert columns, rows
    Sidekiq::Client.push_bulk('class' => WarehouseCategoryWorkers::FillInWarehouseCategoryParent,
                              'args' => dw_ids.collect(&method(:Array)))
  end
end
-
-
1
# ETL worker that bulk-loads one batch of Mongo book-version-exception
# documents into the book_version_exceptions Postgres table via COPY.
class BookVersionExceptionEtl
  include Sidekiq::Worker
  sidekiq_options queue: :etl
  # Number of Mongo documents handled per job.
  BATCH_SIZE = 100000

  # Streams batch `batch_number` of `collection_name` into Postgres. Each
  # exception flag is coerced to a strict boolean with !!.
  def perform(collection_name, batch_number, warehouse_date_id)
    # Column order must match the `stats` array built below.
    columns = %w[created_at updated_at warehouse_date_id
      warehouse_book_version_id amazon_not_found_in_search amazon_no_image amazon_no_buy_button amazon_no_price
      bn_not_found_in_search no_isbn amazon_ambiguous_result apple_invalid]
    timestamp = Time.current.utc
    connection = BookVersionException.connection.raw_connection
    sql = "COPY book_version_exceptions (#{columns.join(',')}) FROM STDIN CSV NULL '' QUOTE '\"' ESCAPE '\\'"
    connection.exec(sql)

    errmsg = nil
    collection = $mongodb.collection collection_name
    begin
      collection.find({}, timeout: false) do |cursor|
        cursor.skip(batch_number * BATCH_SIZE).limit(BATCH_SIZE).each do |record|
          # Mongo _id doubles as the warehouse_book_version_id here.
          stats = [timestamp, timestamp, warehouse_date_id, record['_id'], !!record['amazon_not_found_in_search'],
                   !!record['amazon_no_image'], !!record['amazon_no_buy_button'], !!record['amazon_no_price'],
                   !!record['bn_not_found_in_search'], !!record['no_isbn'], !!record['amazon_ambiguous_result'], !!record['apple_invalid']]

          connection.put_copy_data("#{stats.join(',')}\n")
        end
      end
    # NOTE(review): `rescue Errno` mirrors the pg gem's COPY example, but
    # Errno is a namespace module and SystemCallError subclasses do not appear
    # to include it — this rescue may never match; confirm.
    rescue Errno => err
      errmsg = '%s while reading copy data: %s' % [err.class.name, err.message]
    end

    PostgresUtilities.finalize_copy_command connection, errmsg
  end

  # Enqueues one job per BATCH_SIZE documents in the day's exception collection.
  def self.process_date(date_or_date_string)
    date = date_or_date_string.is_a?(Date) ? date_or_date_string : date_or_date_string.to_date
    warehouse_date_id = WarehouseDate.where(date: date).first_or_create.id
    collection = MongoUtilities.daily_collection(:book_version_exceptions, date)
    params = (collection.count / BATCH_SIZE.to_f).ceil.times.collect do |batch_number|
      [collection.name, batch_number, warehouse_date_id]
    end

    Sidekiq::Client.push_bulk('class' => EtlWorkers::BookVersionExceptionEtl, 'args' => params) if params.present?
  end
end
-
-
1
# Merges the per-batch category stat fragments written to Mongo during the
# nightly stats ETL into a single set of per-category aggregates, then bulk
# inserts them into category_stats.
class CategoryStatsEtl
  include Sidekiq::Worker
  sidekiq_options queue: :etl

  # date_string - the scrape date (string) whose fragments should be merged.
  def perform(date_string)
    date = date_string.to_date
    warehouse_date_id = WarehouseDate.find_by(date: date).id
    category_stats_fragments = CategoryStatsCollection.new(date).find.to_a
    # Seed the accumulator with the last fragment; the remaining fragments are
    # folded into it below. NOTE(review): assumes at least one fragment exists
    # for the date — `pop` on an empty array would make this raise; confirm.
    category_stats = category_stats_fragments.pop.except('_id')
    category_stats_fragments.each do |stats|
      stats.each_pair do |category_key, hash|
        next if category_key == '_id'

        if category_stats[category_key].present?
          # (Removed a no-op line that read category_stats[category_key]['best_rank']
          # without using the result.)
          if hash['best_rank'] < category_stats[category_key]['best_rank']
            category_stats[category_key]['best_rank'] = hash['best_rank']
            category_stats[category_key]['best_rank_book_version_id'] = hash['best_rank_book_version_id']
          end
          if hash['worst_rank'] > category_stats[category_key]['worst_rank']
            category_stats[category_key]['worst_rank'] = hash['worst_rank']
            category_stats[category_key]['worst_rank_book_version_id'] = hash['worst_rank_book_version_id']
          end
          category_stats[category_key]['ranks'] += hash['ranks']
        else
          category_stats[category_key] = hash
        end
      end
    end

    columns = %w[best_rank worst_rank book_version_count mean_rank median_rank category_name best_rank_book_version_id
                 worst_rank_book_version_id warehouse_region_id warehouse_date_id warehouse_category_id]

    attributes = category_stats.collect do |_, stats|
      [stats['best_rank'], stats['worst_rank'], stats['ranks'].count, stats['ranks'].mean, stats['ranks'].median, stats['category_name'],
       stats['best_rank_book_version_id'], stats['worst_rank_book_version_id'], stats['warehouse_region_id'], warehouse_date_id,
       stats['warehouse_category_id']]
    end

    CategoryStat.batch_insert columns, attributes
  end
end
-
end
-
1
module MaintenanceWorkers
-
1
# Runs a verbose ANALYZE over the whole Postgres database so the planner's
# statistics stay fresh after the nightly ETL, logging start/finish under
# the 'dw' tag.
class RunPostgresAnalyze
  include Sidekiq::Worker
  sidekiq_options queue: :high

  def perform
    logger = Rails.logger
    logger.tagged('dw') { logger.info "Starting postgres ANALYZE process: #{Time.current}" }

    connection = ActiveRecord::Base.connection
    # Run outside the query cache so the statement always hits the database.
    connection.uncached { connection.execute('ANALYZE VERBOSE;') }

    logger.tagged('dw') { logger.info "Completed postgres ANALYZE process: #{Time.current}" }
  end
end
-
-
1
# Clears sleeping and phantom Sidekiq workers from the given queues via
# RedisUtilities.
class ClearStaleWorkers
  include Sidekiq::Worker
  sidekiq_options queue: :high

  # Workers idle beyond this many seconds are treated as stale.
  STALE_AFTER_SECONDS = 1200

  # queue_names - queues to sweep; defaults to the scraper queue.
  def perform(queue_names = ['scraper'])
    RedisUtilities.clear_sleeping_and_phantom_workers(queue_names, STALE_AFTER_SECONDS)
  end
end
-
end
-
1
module MergePurgeJobs
-
1
# Downloads a CSV from `file_location`, stores a local copy under tmp/, and
# hands it to MergePurge for reconciliation on behalf of the given user.
class ReconcileCSV
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # file_location - path or URL of the CSV to reconcile.
  # user_email - email of the user the reconciliation is attributed to.
  def perform(file_location, user_email)
    Rails.logger.tagged('ingestions') {Rails.logger.info 'Sidekiq worker Reconcile CSV started'}
    csv = File.new("#{Rails.root}/tmp/#{File.basename(file_location)}", 'wb')
    # SECURITY NOTE(review): Kernel#open on an externally supplied string will
    # spawn a subprocess if the value starts with '|'; if file_location can
    # come from untrusted input, prefer File.open/URI.open explicitly.
    csv << open(file_location).read
    csv.flush

    user = User.find_by email: user_email
    MergePurge.reconcile_csv(csv, user)
    # File.delete accepts the File object via #to_path; the handle itself is
    # never explicitly closed — presumably reclaimed by GC. TODO confirm.
    File.delete(csv)
  end
end
-
-
1
# Pulls the Vook production title list (ISBN/ASIN pairs) from Salesforce and
# reconciles it, without metadata, for the given user.
class ReconcileVookProductionList
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  def perform(user_email)
    Rails.logger.tagged('ingestions') {Rails.logger.info 'Sidekiq worker Reconcile CSV started'}
    # SECURITY NOTE(review): Salesforce credentials are hard-coded here; they
    # should live in encrypted configuration / environment variables.
    client = Restforce.new :username => 'system@vook.com',
                           :password => 'v0oknation',
                           :security_token => 'kt6KNlNaQluhkbcstMG6UGLn',
                           :client_id => '3MVG9ytVT1SanXDk2W.y22B_aWDxjFj_QyQVOrCscHWaHOCaR8I03EpC6OqBzekjbhNtyO99NOYiBj9ZPEtRS',
                           :client_secret => '7035778425829838587'

    # Titles distributed at least 6 days ago to Amazon (KDP or Kindle),
    # excluding two specific accounts, abandoned titles and in-revision titles.
    array = []
    client.query("SELECT ISBN__c, ASIN__c FROM Title__c WHERE Abandoned__c = False AND (AmazonKDP__c = True or AmazonKindle__c = True)
      AND Account__c NOT IN ('0016000000uvL4Z', '0016000000o4LsM') AND DistributionCompletedDate__c <= #{(Date.today-6.days)} AND RevisingTitle__c = ''").each do |data|
      array << [data['ISBN__c'], data['ASIN__c']]
    end

    user = User.find_by email: user_email
    MergePurge.reconcile_asin_and_isbn13_array_no_metadata(array, user)
  end
end
-
-
-
end
-
1
module MergePurgeWorkers
-
1
# Fetches the weekly Random House ONIX 2.1 delta feeds (physical, eBook
# agency/wholesale, international physical) over FTP, unzips each, and runs
# MergePurge reconciliation against the RH Inc and RH DE accounts.
class ProcessOnixDeltas
  include Sidekiq::Worker
  sidekiq_options queue: :high

  def perform
    Rails.logger.tagged('onix') {Rails.logger.info 'Sidekiq worker Reconcile ONIX XML started'}

    # Feeds are published on Sundays; target the most recent one.
    most_recent_sunday = Date.current.wday == 0 ? Date.current : Date.current - Date.current.wday
    rhinc_user = User.find_by email: 'rhincactive@booklr.com'
    rhde_user = User.find_by email: 'rhde@booklr.com'

    # Physical Delta
    # NOTE(review): FTP credentials are hard-coded throughout this worker;
    # consider moving them to configuration.
    xml_file_name = nil
    Net::FTP.open('ftp.randomhouse.com', 'rhcat', 'rhcat') do |ftp|
      ftp.passive = true
      ftp.chdir '/onix_21/delta'
      # Only download when the newest listed file carries this week's YYYYMMDD.
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", 'w')
        ftp.get(file_name, file)
        xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end
    end

    # main physical list (with no export edition titles) goes only on the rhinc account
    # NOTE(review): if no matching file was found, xml_file_name is still nil
    # and this File.open raises — presumably intentional fail-fast; confirm.
    xml = File.open("#{Rails.root}/tmp/#{xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhinc_user)
    File.delete(xml)

    # eBook Deltas
    agency_xml_file_name = nil
    wholesale_xml_file_name = nil
    Net::FTP.open('ftp.randomhouse.com', 'ebookgreen', 'rand0mgr') do |ftp|
      ftp.passive = true

      # Agency
      ftp.chdir '/onix_21/delta/Agency'
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", "w")
        ftp.get(file_name, file)
        agency_xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end

      # Wholesale
      ftp.chdir '/onix_21/delta/Wholesale'
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", 'w')
        ftp.get(file_name, file)
        wholesale_xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end
    end

    # Ebook deltas go on the rhinc and rhde accounts
    xml = File.open("#{Rails.root}/tmp/#{agency_xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhinc_user)

    # reopen file for RHDE processing
    xml = File.open("#{Rails.root}/tmp/#{agency_xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhde_user)
    File.delete(xml)

    xml = File.open("#{Rails.root}/tmp/#{wholesale_xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhinc_user)

    # reopen file for RHDE processing
    xml = File.open("#{Rails.root}/tmp/#{wholesale_xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhde_user)
    File.delete(xml)

    # International Physical Delta only goes on the rhde account (this is missing all Canada exclusives included in
    # the main physical feed but includes the export edition titles)
    xml_file_name = nil
    Net::FTP.open('ftp.randomhouse.com', 'rhintcat', 'RH1ntCat') do |ftp|
      ftp.passive = true
      ftp.chdir '/onix_21/delta'
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub("-",""))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", "w")
        ftp.get(file_name, file)
        xml_file_name = EnterpriseReports.unzip_file(file,"#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end
    end

    xml = File.open("#{Rails.root}/tmp/#{xml_file_name}")
    MergePurge.reconcile_onix_xml(xml, rhde_user)
    File.delete(xml)

    Rails.logger.tagged('onix') {Rails.logger.info 'Reconcile ONIX XML completed processing all 3 files'}
  end
end
-
-
1
# Fetches the weekly Random House *full* ONIX 2.1 feeds (physical plus eBook
# agency/wholesale) over FTP, unzips each, and updates book metadata via
# MergePurge.update_metadata for the RH Inc active account.
class UpdateMetadataFromOnix
  include Sidekiq::Worker
  sidekiq_options queue: :high

  def perform
    Rails.logger.tagged('onix') {Rails.logger.info 'Sidekiq worker Update Metadata from full ONIX XML started'}

    # Feeds are published on Sundays; target the most recent one.
    most_recent_sunday = Date.current.wday == 0 ? Date.current : Date.current - Date.current.wday

    # Physical Delta
    # NOTE(review): FTP credentials are hard-coded; consider configuration.
    xml_file_name = nil
    Net::FTP.open('ftp.randomhouse.com', 'rhcat', 'rhcat') do |ftp|
      ftp.passive = true
      ftp.chdir '/onix_21/full'
      # Only download when the newest listed file carries this week's YYYYMMDD.
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", 'w')
        ftp.get(file_name, file)
        xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end
    end

    # NOTE(review): if no matching file was found, xml_file_name is nil and
    # this File.open raises — presumably intentional fail-fast; confirm.
    xml = File.open("#{Rails.root}/tmp/#{xml_file_name}")
    user = User.find_by email: 'rhincactive@booklr.com'
    MergePurge.update_metadata(xml, user)
    File.delete(xml)

    # eBook Deltas
    agency_xml_file_name = nil
    wholesale_xml_file_name = nil
    Net::FTP.open('ftp.randomhouse.com', 'ebookgreen', 'rand0mgr') do |ftp|
      ftp.passive = true

      # Agency
      ftp.chdir '/onix_21/full/Agency'
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", 'w')
        ftp.get(file_name, file)
        agency_xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end

      # Wholesale
      ftp.chdir '/onix_21/full/Wholesale'
      if ftp.nlst.last.include?((most_recent_sunday).to_s.gsub('-', ''))
        file_name = ftp.nlst.last
        file = File.open("#{Rails.root}/tmp/#{file_name}", 'w')
        ftp.get(file_name, file)
        wholesale_xml_file_name = EnterpriseReports.unzip_file(file, "#{Rails.root}/tmp/").first.to_s
        File.delete(file)
      end
    end

    xml = File.open("#{Rails.root}/tmp/#{agency_xml_file_name}")
    MergePurge.update_metadata(xml, user)
    File.delete(xml)

    xml = File.open("#{Rails.root}/tmp/#{wholesale_xml_file_name}")
    MergePurge.update_metadata(xml, user)
    File.delete(xml)

    Rails.logger.tagged('onix') {Rails.logger.info 'Sidekiq worker Update Metadata from full ONIX XML completed processing all 3 files'}
  end
end
-
end
-
1
module MongoBookCategoryWorkers
-
1
# Scrapes one Amazon best-sellers category page and records the category's
# state (canonical / alternative / deleted) in the AmazonCategoryCollection,
# fanning out scraper jobs for any subcategories found.
class AmazonCategoryScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # category_id   - Amazon category id (nil is tolerated for root listings).
  # category_name - the name we currently associate with category_id.
  # tld           - Amazon marketplace tld.
  # retry_count   - captcha retry counter; the job requeues itself up to 5 times.
  def perform(category_id, category_name, tld, retry_count = 0)
    ProxyUtilities.proxy_setup :amazon

    collection = AmazonCategoryCollection.new
    page = AmazonBestSellersPage.by_category_id_and_tld_and_base_category_and_page_number category_id, tld, ScraperUtilities.base_category_from_category_name(category_name), 1
    if handle_captcha(page, 60, category_id, category_name, tld, retry_count)
      category_name_from_page = page.scrape_category_name
      if AmazonBestSellersPage.is_acceptable_amazon_category_name? category_name_from_page
        if category_id.nil? || category_name_from_page == category_name
          # Name on the page matches expectation: record as canonical and
          # recurse into subcategories.
          collection.add_category_details category_id, category_name, tld, :canonical

          # Fix: scrape the subcategory list once (the original called
          # page.scrape_subcategories twice — once to iterate and once for
          # the trailing present? check — doing the scrape work twice).
          subcategories = page.scrape_subcategories
          if subcategories.present?
            subcategories.each do |subcategory_details|
              args = {category_type: 'AmazonBookCategory', category_id: subcategory_details[:category_id],
                      name: subcategory_details[:category_name], status: :alternative, tld: tld}
              Sidekiq::Client.push 'class' => 'WarehouseCategoryWorkers::CreateAmazon', 'queue' => 'high', 'args' => [args]
              MongoBookCategoryWorkers::AmazonCategoryScraper.perform_async subcategory_details[:category_id], subcategory_details[:category_name], tld
            end
          end
        else
          # Page reports a different name: the scraped name becomes canonical
          # and the name we had becomes an alternative.
          collection.add_category_details category_id, category_name_from_page, tld, :canonical
          collection.add_category_details category_id, category_name, tld, :alternative
          args = {category_type: 'AmazonBookCategory', category_id: category_id, name: category_name_from_page,
                  status: :canonical, tld: tld}
          Sidekiq::Client.push 'class' => 'WarehouseCategoryWorkers::CreateAmazon', 'queue' => 'high', 'args' => [args]
        end
      else
        # The page no longer shows an acceptable category name: mark deleted.
        collection.add_category_details category_id, category_name, tld, :deleted
      end
    else
      # Captcha was not cleared: retry a bounded number of times, then mark
      # the category deleted rather than retrying forever.
      if retry_count < 5
        self.class.perform_async category_id, category_name, tld, retry_count + 1
      else
        collection.add_category_details category_id, category_name, tld, :deleted
      end
    end
  end
end
-
end
-
1
module MongoBookVersionExceptionWorkers
-
1
# Checks whether a book version's Amazon product page has gone away and, when
# the page returns a 404, flags the book version as :page_not_found.
class DiscoverAmazon404s
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  def perform(warehouse_book_version_id, asin, tld)
    ProxyUtilities.proxy_setup :amazon
    product_page = AmazonProductPage.by_asin_and_tld asin, tld
    return unless handle_captcha(product_page, 60, warehouse_book_version_id, asin, tld)
    return unless product_page.response_code == '404'

    BookVersionStatusCollection.new.set_book_version_status warehouse_book_version_id, :page_not_found
  end
end
-
-
1
# Validates a book version against Amazon search results: records
# not-found/ambiguous exceptions in Mongo, kicks off a product-page validation
# on success, and updates the book version's status (unless already ingested).
class ValidateAmazonSearch
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # isbn_or_asin - identifier used for the search.
  # url_hints - data used to disambiguate/match the right search result.
  def perform(collection_name, warehouse_book_version_id, isbn_or_asin, asin, tld, status, url_hints)
    ProxyUtilities.proxy_setup :amazon
    search_page = AmazonSearchPage.by_isbn_or_asin_and_tld isbn_or_asin, tld
    if handle_captcha(search_page, 60, collection_name, warehouse_book_version_id, isbn_or_asin, asin, tld, status, url_hints)
      exceptions = {amazon_not_found_in_search: Validations.amazon_not_found_in_search?(search_page),
                    amazon_ambiguous_result: Validations.amazon_ambiguous_search_results?(search_page, url_hints)}

      if exceptions[:amazon_not_found_in_search] || exceptions[:amazon_ambiguous_result]
        new_status = :invalid_on_amazon
      else
        new_status = :validated
        # The search looked clean — follow up with a product-page validation
        # using the matched result URL.
        MongoBookVersionExceptionWorkers::ValidateAmazonProductPage.perform_async collection_name, warehouse_book_version_id, asin, tld, search_page.matching_url_from_search_results(url_hints)
      end

      MongoUtilities.add_exception_to_collection $mongodb.collection(collection_name), warehouse_book_version_id, exceptions
      # Ingested book versions keep their status regardless of the outcome.
      BookVersionStatusCollection.new.set_book_version_status warehouse_book_version_id, new_status unless status.to_s == 'ingested'
    end
  end
end
-
-
1
# Validates a book version's Amazon product page (price, image, buy button)
# and records the resulting exception flags in Mongo.
class ValidateAmazonProductPage
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Loads the page by ASIN when one is present, otherwise via the URL matched
  # from the earlier search validation.
  def perform(collection_name, warehouse_book_version_id, asin, tld, matching_search_url)
    ProxyUtilities.proxy_setup :amazon

    product_page = if asin.present?
                     AmazonProductPage.by_asin_and_tld(asin, tld)
                   else
                     AmazonProductPage.new(matching_search_url)
                   end
    return unless handle_captcha(product_page, 60, collection_name, warehouse_book_version_id, asin, tld, matching_search_url)

    findings = {amazon_no_price: Validations.amazon_no_price?(product_page),
                amazon_no_image: Validations.amazon_no_image?(product_page),
                amazon_no_buy_button: Validations.amazon_no_buy_button?(product_page)}
    MongoUtilities.add_exception_to_collection $mongodb.collection(collection_name), warehouse_book_version_id, findings
  end
end
-
-
1
# Validates that a book version can be found on Barnes & Noble. Titles with no
# ISBN are immediately flagged as not found; otherwise a B&N search is scraped.
class ValidateBarnesAndNoble
  include Sidekiq::Worker
  sidekiq_options queue: :bn_stat_scraper

  def perform(collection_name, warehouse_book_version_id, isbn_or_asin)
    exception = {no_isbn: Validations.no_isbn?(isbn_or_asin)}
    if exception[:no_isbn]
      # B&N lookup is by ISBN, so a missing ISBN implies not found in search.
      exception[:bn_not_found_in_search] = true
    else
      ProxyUtilities.proxy_setup :barnes_and_noble
      page = BnSearchPage.by_isbn isbn_or_asin

      # sleep for 60 seconds and requeue the job if you get throttled
      if bn_captcha_sleepy_time(page, 60, collection_name, warehouse_book_version_id, isbn_or_asin)
        exception[:bn_not_found_in_search] = Validations.bn_not_found_in_search?(page)
      end
    end

    MongoUtilities.add_exception_to_collection $mongodb.collection(collection_name), warehouse_book_version_id, exception
  end
end
-
-
1
class ValidateItunes
  include Sidekiq::Worker
  sidekiq_options queue: :scraper

  # Records an apple_invalid exception flag for the book version in Mongo.
  # iTunes Validations: Only validate if the title is ingested, an ebook, has an isbn and is part of rhincactive list
  def perform(collection_name, warehouse_book_version_id, isbn_or_asin)
    collection = $mongodb.collection(collection_name)
    exception_flags = {apple_invalid: Validations.apple_invalid?(isbn_or_asin)}
    MongoUtilities.add_exception_to_collection collection, warehouse_book_version_id, exception_flags
  end
end
-
end
-
1
module MongoListStatWorkers
-
1
# Scrapes one page of an Amazon best-sellers (Top 100) category list, stores
# the ranked stats in Mongo, and queues creation of any ASINs not yet known.
class AmazonTop100
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  def perform(category_id, tld, base_category, page_number, warehouse_category_id, collection_name)
    ProxyUtilities.proxy_setup :amazon
    page = AmazonBestSellersPage.by_category_id_and_tld_and_base_category_and_page_number category_id, tld, base_category, page_number
    if handle_captcha(page, 60, category_id, tld, base_category, page_number, warehouse_category_id, collection_name)
      # Queue up remaining top 100 pages for this category if it's the first page
      (2..page.scrape_number_of_pages).each {|page_num| MongoListStatWorkers::AmazonTop100.perform_async category_id, tld, base_category, page_num, warehouse_category_id, collection_name} if page_number == 1
      mongo_id = "#{warehouse_category_id}-#{page_number}"
      data = {_id: mongo_id, warehouse_category_id: warehouse_category_id, tld: tld, stats: page.best_sellers_stats}
      if data[:stats].blank? || (data[:stats].present? && data[:stats].any? {|stat| stat[:rank].blank?})
        # Incomplete scrape (no stats, or a stat missing its rank): retry this
        # page unless the category legitimately has no best sellers.
        MongoListStatWorkers::AmazonTop100.perform_async category_id, tld, base_category, page_number, warehouse_category_id, collection_name unless page.scrape_no_best_sellers?
      else
        # Bulk-create book versions for ASINs not yet in the all-asin list,
        # then persist the page's stats.
        asins = data[:stats].collect {|stat| stat[:asin]}.uniq
        values = asins.collect {|asin| [{asin: asin, tld: tld, status: :validated_from_top_100s, source: 'amazon-top100'}] unless MongoUtilities.exists_in_all_asin_list?(asin, tld)}.compact
        Sidekiq::Client.push_bulk('class' => 'BookVersionWorkers::Create', 'args' => values, 'queue' => 'background')
        $mongodb.collection(collection_name).insert data
      end
    end
  end
end
-
-
1
# Scrapes one of the two Barnes & Noble Nook bestseller lists and fans out a
# per-title job that fills in the ISBN/BN ID before the stats are persisted.
class BarnesNobleTop100
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :bn_list_stat_scraper

  # Maps the scrape-method symbol (key) to the human-readable list name (value).
  LIST_NAMES = {bn_top_100_nook_books: 'Barnes & Noble Top 100 NOOK Books', bn_nook_book_bestsellers: 'Barnes & Noble NOOK Book Bestsellers'}.freeze

  # list_name must be one of LIST_NAMES' values; raises ArgumentError otherwise.
  def perform(list_name, collection_name)
    raise ArgumentError unless LIST_NAMES.values.include?(list_name)

    # always use tor for the 2 list scrapes
    ProxyUtilities.force_proxy

    # LIST_NAMES.key(list_name) is the symbol of the scrape method defined below.
    stats = send(LIST_NAMES.key(list_name))
    values = stats.collect {|stat| [list_name, stat[:rank], stat[:title], stat[:author], stat[:list_price], stat[:price], stat[:href], collection_name]}
    Sidekiq::Client.push_bulk('class' => MongoListStatWorkers::BarnesNobleFillInIsbnAndBnId, 'args' => values)
  end

  # Scrapes the five result pages of the NOOK Book Bestsellers list and
  # returns an array of stat hashes (nil fields when a selector misses).
  def bn_nook_book_bestsellers
    pages = %w[1 21 41 61 81].collect {|start| HttpHelper.get_bn_list_stat_page_html "http://www.barnesandnoble.com/u/nook-books-bestsellers/379003503?start=#{start}"}
    pages.each_with_object([]) do |page, array|
      page.css('.result').each do |element|
        array << {rank: (element.css('.resultNum').text.squish if element.css('.resultNum').present?),
                  title: (element.css('.title a').first.text.squish if element.css('.title a').present?),
                  author: (element.css('.contributor a').first.text.squish if element.css('.contributor a').present?),
                  list_price: (element.css('.list-price span').first.text.squish.gsub(/\$|\./, '') if element.css('.list-price span').present?), # No longer on page
                  price: (element.css('.pricing.bn-price strong').first.text.squish.gsub(/\$|\./, '') if element.css('.pricing.bn-price strong').present?),
                  href: (element.css('.title a').first.attributes['href'].text if element.css('.title a').present?)}
      end
    end
  end

  # Scrapes the five result pages of the Top 100 NOOK Books list; rank is
  # positional (array.size + 1) since the page does not render one.
  def bn_top_100_nook_books
    pages = %w[1 22 43 64 85].collect {|start| HttpHelper.get_bn_list_stat_page_html "http://www.barnesandnoble.com/ebooks/category.asp?PID=35951&start=#{start}"}
    pages.each_with_object([]) do |page, array|
      page.css('.merch-ebook .ebook-info').each do |element|
        array << {rank: array.size + 1,
                  title: (element.css('h4 a').first.text.squish if element.css('h4 a').present?),
                  author: (element.css('.Contributor a').first.text.squish if element.css('.Contributor a').present?),
                  list_price: (element.css('.memberPriceGroup1 .list-price').first.text.squish.gsub(/\$|\.| List Price/, '') if element.css('.memberPriceGroup1 .list-price').present?),
                  price: (element.css('.memberPriceGroup0 strong').first.text.squish.gsub(/\$|\./, '') if element.css('.memberPriceGroup0 strong').present?),
                  href: (element.css('h4 a').first.attributes['href'].text if element.css('h4 a').present?)}
      end
    end
  end
end
-
-
1
# Looks up the Nook edition of a bestseller entry to fill in its ISBN-13 (or
# BN ID as a fallback), queues creation of the book version when an ISBN is
# found, and writes the completed stat row into Mongo.
class BarnesNobleFillInIsbnAndBnId
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :bn_list_stat_scraper

  def perform(list_name, rank, title, author, list_price, price, url, collection_name)
    # always use tor
    ProxyUtilities.force_proxy

    stats = {rank: rank, title: title, author: author, list_price: list_price, price: price}
    summary_page = HttpHelper.get_html_with_mechanize url, 'Windows IE 6'
    if summary_page.present?
      # Use the summary page directly when its Format detail already says
      # 'ebook'; otherwise follow the nook-book link from the formats/editions box.
      page = if summary_page.css('.product-details').present? && summary_page.css('.product-details').xpath("//li/span[starts-with(., 'Format')]").present? && summary_page.css('.product-details').xpath("//li/span[starts-with(., 'Format')]").first.parent.children.last.text.squish.downcase == 'ebook'
        summary_page
      else
        nook_book_page_url = summary_page.css('.all-formats-editions .format .image a[data-bn-rel]').map {|element| element.attributes['data-bn-rel'].text}.select {|attr| attr.include?('format=nook-book')}.first
        nook_book_page_url.present? ? HttpHelper.get_html_with_mechanize(nook_book_page_url, 'Windows IE 6') : nil
      end

      if page.present? && page.css('.product-details').present?
        if page.css('.product-details').xpath("//li/span[starts-with(., 'ISBN-13')]").present?
          isbn = page.css('.product-details').xpath("//li/span[starts-with(., 'ISBN-13')]").first.parent.children.last.text.squish
          stats[:isbn] = isbn
          # Queue creation of the book version for this newly-seen ISBN.
          Sidekiq::Client.push_bulk('class' => 'BookVersionWorkers::Create', 'args' => [[{isbn13: isbn, tld: '.com', source: 'bn-top100'}]], 'queue' => 'background')
        elsif page.css('.product-details').xpath("//li/span[starts-with(., 'BN ID')]").present?
          stats[:bn_id] = page.css('.product-details').xpath("//li/span[starts-with(., 'BN ID')]").first.parent.children.last.text.squish
        end
      end
    end

    # The stat row is inserted even when no ISBN/BN ID could be resolved.
    $mongodb.collection(collection_name).insert({_id: "#{list_name}-#{rank}", name: list_name, tld: '.com', stats: [stats]})
  end
end
-
-
1
class AppleTopBooksFeed
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :scraper

  # Pulls the Apple top-books RSS feed stats for a category/list type and
  # inserts them into the given Mongo collection.
  def perform(category_id, list_type, warehouse_category_id, collection_name)
    feed = AppleTopBooksRssFeed.by_category_id_and_type(category_id, list_type)
    document = {_id: "#{warehouse_category_id}-#{list_type}",
                warehouse_category_id: warehouse_category_id,
                tld: '.com',
                list_type: list_type,
                stats: feed.stats}

    $mongodb.collection(collection_name).insert document
  end
end
-
end
-
1
module MongoPromotionPageWorkers
-
1
class AmazonKindleDailyDealsScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes the Amazon Kindle Daily Deals landing page, tags each deal with
  # the promotion page name, fans out per-deal search-page scrape jobs, and
  # stores the deals in the daily promotion_pages collection.
  def perform
    ProxyUtilities.proxy_setup :amazon
    page = AmazonKindleDailyDealsPage.new
    return unless handle_captcha(page, 60)

    tagged_deals = page.deals.each {|deal| deal.merge!(promotion_page: 'Amazon Kindle Daily Deals')}

    search_page_args = page.search_page_daily_deals.collect {|deal| [deal[:url], deal[:daily_deal_type]]}
    Sidekiq::Client.push_bulk('class' => MongoPromotionPageWorkers::AmazonKindleDailyDealsSearchPageScraper, 'args' => search_page_args) if search_page_args.present?

    MongoUtilities.daily_collection(:promotion_pages).insert tagged_deals if tagged_deals.present?
  end
end
-
-
1
class AmazonKindleDailyDealsSearchPageScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes an Amazon search page backing a Kindle Daily Deal, tags each
  # result with the promotion page and deal type, and stores the results in
  # the daily promotion_pages collection.
  def perform(url, daily_deal_type)
    ProxyUtilities.proxy_setup :amazon
    search_page = AmazonSearchPage.new(url)
    return unless handle_captcha(search_page, 60, url, daily_deal_type)

    tagged_deals = search_page.search_results.each do |deal|
      deal.merge!(promotion_page: 'Amazon Kindle Daily Deals', daily_deal_type: daily_deal_type)
    end

    MongoUtilities.daily_collection(:promotion_pages).insert tagged_deals if tagged_deals.present?
  end
end
-
-
1
class AmazonKindleMonthlyDealsScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes the Kindle Monthly Deals landing page, tags each deal, fans out a
  # category-page scrape per category URL, and stores the deals in the daily
  # promotion_pages collection.
  def perform
    ProxyUtilities.proxy_setup :amazon
    deals_page = AmazonKindleMonthlyDealsPage.new
    return unless handle_captcha(deals_page, 60)

    tagged_deals = deals_page.deals.each {|deal| deal.merge!(promotion_page: 'Amazon Kindle Monthly Deals $3.99 or Less')}

    category_args = deals_page.category_urls.collect(&method(:Array))
    Sidekiq::Client.push_bulk('class' => MongoPromotionPageWorkers::AmazonKindleMonthlyDealsCategoryPageScraper, 'args' => category_args) if category_args.present?

    MongoUtilities.daily_collection(:promotion_pages).insert tagged_deals if tagged_deals.present?
  end
end
-
-
1
class AmazonKindleMonthlyDealsCategoryPageScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes one category search page of the Kindle Monthly Deals promotion.
  # On the first page it also queues a job for every pagination URL, then
  # stores the tagged results in the daily promotion_pages collection.
  def perform(url)
    ProxyUtilities.proxy_setup :amazon
    category_page = AmazonSearchPage.new(url)
    return unless handle_captcha(category_page, 60, url)

    tagged_deals = category_page.search_results.each do |deal|
      deal.merge!(promotion_page: 'Amazon Kindle Monthly Deals $3.99 or Less',
                  daily_deal_type: "Category Page - #{deal[:category_name]}")
    end

    if category_page.scrape_search_page_number == 1 && category_page.scrape_pagination_urls.present?
      pagination_args = category_page.scrape_pagination_urls.collect(&method(:Array))
      Sidekiq::Client.push_bulk('class' => MongoPromotionPageWorkers::AmazonKindleMonthlyDealsCategoryPageScraper, 'args' => pagination_args) if pagination_args.present?
    end

    MongoUtilities.daily_collection(:promotion_pages).insert tagged_deals if tagged_deals.present?
  end
end
-
-
1
class AmazonKindleSelectPageScraper
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes the Amazon Kindle Select 25 page, tags every title with the
  # promotion page name, and stores them in the daily promotion_pages
  # collection.
  def perform
    ProxyUtilities.proxy_setup :amazon
    select_page = AmazonKindleSelectPage.new
    return unless handle_captcha(select_page, 60)

    tagged_titles = select_page.all_select_titles.each {|deal| deal.merge!(promotion_page: 'Amazon Kindle Select 25')}
    MongoUtilities.daily_collection(:promotion_pages).insert tagged_titles if tagged_titles.present?
  end
end
-
-
1
class BnNookDailyFindPageScraper
  include Sidekiq::Worker
  sidekiq_options queue: :bn_list_stat_scraper

  # Scrapes the B&N Nook Daily Find page (the featured book plus the
  # carousel books), tags everything with the promotion page name, and stores
  # the lot in the daily promotion_pages collection.
  def perform
    ProxyUtilities.proxy_setup :barnes_and_noble
    daily_find_page = BnNookDailyFindPage.new
    return unless bn_captcha_sleepy_time(daily_find_page, 60)

    deals = [daily_find_page.scrape_daily_find_book.merge!(promotion_page: 'Barnes & Noble Nook Daily Find')]
    deals += daily_find_page.scrape_daily_find_carousels.each {|deal| deal.merge!(promotion_page: 'Barnes & Noble Nook Daily Find')}
    MongoUtilities.daily_collection(:promotion_pages).insert deals
  end
end
-
-
1
class BnNookUnder299PageScraper
  include Sidekiq::Worker
  sidekiq_options queue: :bn_list_stat_scraper

  # Scrapes one page of the B&N "Nook Books Under $2.99" list, tags each book
  # with the promotion page name, and stores them in the daily
  # promotion_pages collection.
  def perform(start_number)
    ProxyUtilities.proxy_setup :barnes_and_noble
    list_page = BnNookUnder299.by_start_number(start_number)
    return unless bn_captcha_sleepy_time(list_page, 60, start_number)

    tagged_books = list_page.book_details.each do |deal|
      deal.merge!(promotion_page: 'Barnes & Noble Nook Books Under $2.99')
    end
    MongoUtilities.daily_collection(:promotion_pages).insert tagged_books if tagged_books.present?
  end

  # Queues one scrape job per known pagination start number.
  def self.queue_all_pages
    job_args = BnNookUnder299::START_NUMBERS.collect(&method(:Array))
    Sidekiq::Client.push_bulk('class' => MongoPromotionPageWorkers::BnNookUnder299PageScraper, 'args' => job_args)
  end
end
-
end
-
1
module MongoReportWorkers
-
1
# Builds the daily promotions CSV report from the promotion_pages Mongo
# collection, uploads it to S3, and optionally emails/FTPs it to the client.
class PromotionReport
  include Sidekiq::Worker
  sidekiq_options queue: :reporting

  # date_string   - date of the promotions data to report on (parseable by to_date).
  # deliver_email - when truthy, email the finished report.
  # ftp           - when truthy, FTP the finished report to the client.
  def perform(date_string, deliver_email, ftp)
    date = date_string.to_date

    client_name = :rhinc
    report_file_name = "rh-promo-report-#{date.strftime('%m%d%y')}"
    report_hash = EnterpriseReports.generate_report_hash report_file_name, client_name
    report_csv = EnterpriseReports.open_csv(report_hash)
    report_csv << ['Promo Page', 'Daily Deal Type', 'Rank', 'Title', 'Author', 'Price', 'ASIN', 'ISBN', 'BN ID']

    # timeout: false keeps the server-side cursor alive for the full scan.
    collection = MongoUtilities.daily_collection(:promotion_pages, date)
    collection.find({}, timeout: false) do |cursor|
      cursor.sort(promotion_page: 1, daily_deal_type: 1, rank: 1).each do |record|
        report_csv << [record['promotion_page'], record['daily_deal_type'], record['rank'], record['title'], record['author_name'],
                       ReportUtilities.as_price(record['price']), record['asin'], record['isbn'], record['bn_id']]
      end
    end

    report_csv.flush
    EnterpriseReports.move_to_s3(client_name, report_csv)
    EnterpriseReportsMailer.basic_report(report_hash, AmazeBot.config[:reports][:clients][client_name][:reports][:promotions]).deliver if deliver_email
    EnterpriseReports.ftp_to_client(client_name, report_csv) if ftp
    report_csv.close
  end
end
-
end
-
1
module MongoWorkers
-
1
# Scrapes nightly Amazon product-page stats for a single book version and
# upserts them (tagged with identifying metadata) into the given collection.
class GetAmazonProductPageStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  def perform(collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format)
    ProxyUtilities.proxy_setup :amazon
    page = AmazonProductPage.by_asin_and_tld asin, tld
    if handle_captcha(page, 60, collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format)
      stats = StatsScraper.get_amazon_product_page_stats page

      # NOTE: if this instance ever crashes, its port will probably not be consumed again when it restarts which means we'll be doing 2x the threads on
      # one other port. If we want to avoid this we can rescue all errors here and push the soon to be released port back to the front of the list

      #Get categories from page and add them to the category set in redis in case a new one exists
      scraped_categories = get_scraped_category_data_from_stats stats, tld
      RedisUtilities.add_to_set :scraped_categories, scraped_categories if scraped_categories.present?

      $mongodb.collection(collection_name).update({_id: mongo_id},
                                                  {'$set' => stats.merge!(warehouse_book_version_id: warehouse_book_version_id,
                                                                          asin: asin,
                                                                          ean: ean,
                                                                          tld: tld,
                                                                          itunes: itunes_id,
                                                                          book_format: book_format,
                                                                          amazon_scraped_at: Time.current.utc.to_s)},
                                                  upsert: true) if stats.present?
    end
  end

  add_transaction_tracer :perform, :category => :task, :params => '{:collection_name => args[0], :mongo_id => args[1], :warehouse_book_version_id => args[2], :asin => args[3], :ean => args[4], :tld => args[5], :itunes_id => args[6], :book_format => args[7]}'

  # returns an array of arrays [[name1, id1, tld1], [name2, id2, tld2], ...]
  # built from the sub_category{1..3}_tree/_id pairs present in stats.
  def get_scraped_category_data_from_stats(stats, tld)
    %w[1 2 3].collect {|num| [stats[:"sub_category#{num}_tree"], stats[:"sub_category#{num}_id"], tld] if stats[:"sub_category#{num}_tree"].present? && stats[:"sub_category#{num}_id"].present?}.compact
  end
end
-
-
1
# Scrapes an Amazon author page once, then fans out one Mongo update job per
# stats document (mongo_id) that shares the author.
class GetAmazonAuthorPageStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :amazon_author_page_scraper

  # mongo_ids - ids of all nightly stats documents that should receive the
  #             scraped author-page stats.
  def perform(collection_name, mongo_ids, tld, author_asin)
    ProxyUtilities.proxy_setup :amazon
    author_page = AmazonAuthorPage.by_asin_and_tld(author_asin, tld)
    if handle_captcha(author_page, 60, collection_name, mongo_ids, tld, author_asin)
      stats = StatsScraper.get_amazon_author_page_stats author_page

      if stats.present?
        stats.merge!(amazon_author_page_scraped_at: Time.current.utc.to_s)
        params = mongo_ids.collect {|mongo_id| [collection_name, mongo_id, stats]}

        Sidekiq::Client.push_bulk 'class' => MongoWorkers::UpdateAmazonAuthorPageStats, 'args' => params
      end
    end
  end
end
-
-
1
class UpdateAmazonAuthorPageStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :mongo_insert

  # Applies previously-scraped author-page stats to a single stats document
  # via a Mongo $set; no-op when there is nothing to write.
  def perform(collection_name, mongo_id, data)
    return unless data.present?

    $mongodb.collection(collection_name).update({_id: mongo_id}, {'$set' => data})
  end
end
-
-
# TODO set up this job to get the lowest print list price, it'll take refactoring one of the scrapes
-
# class GetAmazonLowestPrintListPrice
-
# include NewRelic::Agent::Instrumentation::ControllerInstrumentation
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :captcha_scraper
-
#
-
# def perform(collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format, author_asin)
-
# # If a page has no list price and is a big 6 publisher title and is a kindle edition then manually
-
# # scrape the related titles for the lowest print list price
-
# if stats[:amazon_list_price].blank?
-
# # Queue up lowest print list price job
-
# stats[:amazon_list_price] = StatsScraper.get_amazon_lowest_print_list_price(page, book_format)
-
# end
-
# end
-
# end
-
-
1
# Scrapes nightly Barnes & Noble stats for a single .com book version (by EAN
# or canonical URL) and upserts them into the given Mongo collection.
class GetBarnesAndNobleStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :bn_stat_scraper

  def perform(collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format, canonical_bn_url = nil)
    # B&N stats are only scraped for .com titles that have an EAN.
    return unless ean.present? && tld == '.com'

    ProxyUtilities.proxy_setup :barnes_and_noble
    page = canonical_bn_url.present? ? BnBookPage.new(canonical_bn_url) : BnBookPage.by_ean(ean)

    # sleep for 60 seconds and requeue the job if you get throttled
    if bn_captcha_sleepy_time(page, 60, collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format, canonical_bn_url)
      stats = StatsScraper.get_stats_for_ean page

      $mongodb.collection(collection_name).update({_id: mongo_id},
                                                  {'$set' => stats.merge!(warehouse_book_version_id: warehouse_book_version_id,
                                                                          asin: asin,
                                                                          ean: ean,
                                                                          tld: tld,
                                                                          itunes: itunes_id,
                                                                          book_format: book_format,
                                                                          barnes_and_noble_scraped_at: Time.current.utc.to_s)},
                                                  upsert: true) if stats.present?
    end
  end

  add_transaction_tracer :perform, :category => :task, :params => '{:collection_name => args[0], :mongo_id => args[1], :warehouse_book_version_id => args[2], :asin => args[3], :ean => args[4], :tld => args[5], :itunes_id => args[6], :book_format => args[7], :canonical_bn_url => args[8]}'
end
-
-
1
class GetItunesStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :itunes_stat_scraper

  # Scrapes iTunes stats for a book version and upserts them into the nightly
  # stats collection, tagging the document with identifying metadata.
  def perform(collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format)
    return unless itunes_id.present?

    stats = StatsScraper.get_stats_for_itunes itunes_id, tld
    return unless stats.present?

    metadata = {warehouse_book_version_id: warehouse_book_version_id,
                asin: asin,
                ean: ean,
                tld: tld,
                itunes: itunes_id,
                book_format: book_format,
                itunes_scraped_at: Time.current.utc.to_s}
    $mongodb.collection(collection_name).update({_id: mongo_id}, {'$set' => stats.merge!(metadata)}, upsert: true)
  end

  add_transaction_tracer :perform, :category => :task, :params => '{:collection_name => args[0], :mongo_id => args[1], :warehouse_book_version_id => args[2], :asin => args[3], :ean => args[4], :tld => args[5], :itunes_id => args[6], :book_format => args[7]}'
end
-
-
1
class GetGoodreadsStats
  include NewRelic::Agent::Instrumentation::ControllerInstrumentation
  include Sidekiq::Worker
  sidekiq_options queue: :goodreads_stat_scraper

  # these 2 books are broken on goodreads and we can't figure out a better way
  # to block these errors from occurring
  BROKEN_WAREHOUSE_BOOK_VERSION_IDS = %w[532397 586979].freeze

  # Scrapes Goodreads stats for a book version (by canonical URL, EAN, or
  # ASIN) and upserts them into the nightly stats collection, tagging the
  # document with identifying metadata. No-op when there is no identifier or
  # the title is on the known-broken list.
  def perform(collection_name, mongo_id, warehouse_book_version_id, asin, ean, tld, itunes_id, book_format, canonical_goodreads_url = nil)
    return unless (canonical_goodreads_url || ean || asin).present?
    return if BROKEN_WAREHOUSE_BOOK_VERSION_IDS.include?(warehouse_book_version_id)

    stats = StatsScraper.get_stats_for_goodreads canonical_goodreads_url, (ean || asin), tld
    return unless stats.present?

    $mongodb.collection(collection_name).update({_id: mongo_id},
                                                {'$set' => stats.merge!(warehouse_book_version_id: warehouse_book_version_id,
                                                                        asin: asin,
                                                                        ean: ean,
                                                                        tld: tld,
                                                                        itunes: itunes_id,
                                                                        book_format: book_format,
                                                                        goodreads_scraped_at: Time.current.utc.to_s)},
                                                upsert: true)
  end

  add_transaction_tracer :perform, :category => :task, :params => '{:collection_name => args[0], :mongo_id => args[1], :warehouse_book_version_id => args[2], :asin => args[3], :ean => args[4], :tld => args[5], :itunes_id => args[6], :book_format => args[7], :canonical_goodreads_url => args[8]}'
end
-
-
1
# Scrapes competitive-format coverage for a German (.de) Amazon title: pulls
# the title's competitive stats, records them (keyed by the parent ASIN) in
# the daily de_competitive_format_data collection, and fans out one job per
# related competitive title.
class GermanCompetitiveCoverage
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  def perform(asin, book_format, isbn13, work_id)
    ProxyUtilities.proxy_setup :amazon
    tld = '.de'
    page = AmazonProductPage.by_asin_and_tld asin, tld
    if handle_captcha(page, 60, asin, book_format, isbn13, work_id)
      # Use the page we already scraped above to get competitive stats for this asin to push into mongo
      data = StatsScraper.get_amazon_competitive_stats page

      # Merge asin/isbn13/format in so you know what it is when outputting the report
      data.merge! book_format: book_format, asin: asin, isbn13: isbn13

      # Collect competitive title asins + determine which is featured and then set that in mongo
      competitive_details = page.scrape_competitive_related_format_data

      # If its a valid page set the featured asin, WorkID and data for this asin/format and run through each competitive
      # title and pull its data and put it in mongo.
      if competitive_details[:valid_page]
        # Set the WorkID and push this title's data onto the document and push the featured asin for this format onto the featured array
        MongoUtilities.daily_collection(:de_competitive_format_data, Date.current).update({_id: "#{competitive_details[:parent_asin]}-#{tld}"}, {'$set' => {WorkID: work_id}, '$push' => {rhde_titles: data, featured: competitive_details[:featured_title]}}, upsert: true)

        competitive_details[:competitive_titles].each do |title|
          MongoWorkers::PopulateGermanCompetitiveData.perform_async title[:asin], competitive_details[:parent_asin], book_format, tld
        end
      else
        # merge in work id and add it to the broken titles document array
        data.merge! work_id: work_id
        MongoUtilities.daily_collection(:de_competitive_format_data, Date.current).update({_id: 'broken_titles'}, {'$push' => {titles: data}}, upsert: true)
      end
    end
  end
end
-
-
1
class PopulateGermanCompetitiveData
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # Scrapes competitive stats for one competitive-title ASIN and pushes them
  # onto the parent title's document in the daily de_competitive_format_data
  # collection.
  def perform(asin, parent_asin, book_format, tld)
    ProxyUtilities.proxy_setup :amazon
    product_page = AmazonProductPage.by_asin_and_tld asin, tld
    return unless handle_captcha(product_page, 60, asin, parent_asin, book_format, tld)

    competitive_stats = StatsScraper.get_amazon_competitive_stats product_page
    return unless competitive_stats.present?

    # Tag the stats so the report knows which asin/format they belong to.
    competitive_stats.merge! book_format: book_format, asin: asin

    MongoUtilities.daily_collection(:de_competitive_format_data, Date.current).update({_id: "#{parent_asin}-#{tld}"}, {'$push' => {competitive_titles: competitive_stats}}, upsert: true)
  end
end
-
end
-
1
class ReportGeneratorWorker
  include Sidekiq::Worker
  sidekiq_options queue: :heavy_reporting

  # Instantiates the configured report cards and runs the report generator
  # for the given date/tld, recording the wall-clock run time in Redis.
  #
  # report_card_config - hash of report name => constructor parameter array.
  def perform(report_date_string, tld, report_card_config)
    report_date = report_date_string.to_date
    report_cards = report_card_config.each_pair.collect do |report_name, parameters|
      klass = ReportCards::ReportCard.report_card_class_by_report_name(report_name)
      klass.new(*parameters) if klass.present?
    end.compact
    return unless report_cards.present?

    elapsed = Benchmark.realtime {ReportGenerator.run(report_date, tld, report_cards: report_cards)}
    $redis.set('report_generator_run_time', elapsed)
  end
end
-
1
# Orchestrates the nightly pipeline for one day: drives it through the STAGES
# state machine, persisting the current stage in the Mongo scheduler
# collection so a restarted job resumes where it left off.
class Scheduler
  include Sidekiq::Worker
  sidekiq_options queue: :high

  # Pipeline stages in execution order; 'done' is terminal.
  STAGES = %w[starting scrape_stats etl secondary_etl analyze post_etl status_report wrapup done].freeze

  # Drops the persisted state for a day so its pipeline can be rerun from scratch.
  def self.hard_reset_daily_scrape_monitor(date_string)
    MongoUtilities.scheduler_collection.remove _id: date_string
  end

  def perform(date_string)
    @date = date_string.to_date

    # A scheduler date in the past means the pipeline never finished that day.
    if @date < Date.current
      Rails.logger.tagged('scheduler') {Rails.logger.info 'FAILED FOR A FULL DAY! GET TO WORK, GUSY!!!1'}
      return
    end

    initialize_scheduler_state
    return if @current_stage == STAGES.last

    # Run stages until the state machine reaches the terminal stage.
    loop do
      process_stage
      break if transition == STAGES.last
    end
  end

  # Loads (or creates) today's scheduler document and sets @current_stage.
  def initialize_scheduler_state
    doc = MongoUtilities.scheduler_collection.find(_id: @date.to_s).limit(1).first
    if doc.present?
      @current_stage = doc.fetch('stage', STAGES.first)
    else
      @current_stage = STAGES.first
      MongoUtilities.scheduler_collection.insert _id: @date.to_s, stage: @current_stage
    end
  end

  # Executes the work for the current stage. Each queue_* helper blocks until
  # its prerequisite jobs have drained before queueing the next batch.
  def process_stage
    case @current_stage
    when 'starting'
      MongoUtilities.scheduler_collection.update({_id: @date.to_s}, {'$set' => {'started_at' => Time.current.to_s}})
      MongoUtilities.daily_collection(:stats).ensure_index({warehouse_book_version_id: 1}, name: 'book_version_id_index')
    when 'scrape_stats'
      queue_stats
    when 'etl'
      queue_etl
    when 'secondary_etl'
      queue_secondary_etl
    when 'analyze'
      queue_analyze
    when 'post_etl'
      queue_post_etl
    when 'status_report'
      queue_status_report
    when 'wrapup'
      queue_wrapup
      MongoUtilities.scheduler_collection.update({_id: @date.to_s}, {'$set' => {'completed_at' => Time.current.to_s}})
    else
      # Unknown stage in the persisted document -- fail loudly.
      raise 'Uh-oh!'
    end
  end

  # Advances @current_stage to the next stage (persisting it) and returns it;
  # returns the terminal stage unchanged once it has been reached.
  def transition
    return @current_stage if @current_stage == STAGES.last

    @current_stage = STAGES[STAGES.index(@current_stage) + 1]
    MongoUtilities.scheduler_collection.update({_id: @date.to_s}, {'$set' => {stage: @current_stage}})

    @current_stage
  end

  # Resets per-source scrape counters and queues the nightly scrape jobs.
  def queue_stats
    # TODO: create portlist, we can make the 200 a constant eventually once we settle on this
    RedisUtilities.populate_port_list(200)

    # queue all jobs
    [:amazon_statable, :itunes_statable, :bn_statable].each do |key|
      RedisUtilities.set_count RedisUtilities.get_scrape_count_key(key), 0
      BookVersionWorkers::QueueNightlyScrape.perform_async key
    end

    # RedisUtilities.set_count RedisUtilities.get_scrape_count_key(:amazon_author_page), 0
    # BookVersionWorkers::QueueAmazonAuthorPageScrape.perform_async
  end

  # Waits for scrapes to finish, then kicks off the primary ETL jobs.
  def queue_etl
    wait_on_jobs [BookVersionWorkers::QueueNightlyScrape, BookVersionWorkers::QueueNightlyScrapeBlock, BookVersionWorkers::QueueAmazonAuthorPageScrape, BookVersionWorkers::QueueAmazonAuthorPageScrapeBlock], 300, 1

    clear_stale_jobs
    EtlWorkers::BookCategoryEtl.perform_async
    MaintenanceWorkers::ClearStaleWorkers.perform_async
    EtlWorkers::QueueNightlyEtl.perform_async
  end

  # Waits for primary ETL, then runs the category stats ETL.
  def queue_secondary_etl
    wait_on_jobs [EtlWorkers::BookCategoryEtl, EtlWorkers::QueueNightlyEtl, EtlWorkers::NightlyStatsEtl, EtlWorkers::NightlyListStatEtl], 300, 1

    clear_stale_jobs
    EtlWorkers::CategoryStatsEtl.perform_async Date.current.to_s
  end

  # Waits for category ETL, then queues the Postgres ANALYZE maintenance job.
  def queue_analyze
    wait_on_jobs [EtlWorkers::CategoryStatsEtl], 300, 1

    clear_stale_jobs
    MaintenanceWorkers::RunPostgresAnalyze.perform_async
  end

  # Waits for ANALYZE, then queues report generation and daily stat jobs.
  def queue_post_etl
    wait_on_jobs [MaintenanceWorkers::RunPostgresAnalyze], 300, 1

    # report config = {email, ftp, gzip, report blocking}
    ReportGeneratorWorker.perform_async Date.current.to_s, '.com', {daily_apple: [true, true, false, true], corporate: [true, true, false, true], corporate2: [true, true, false, true]}

    BooklrStatWorkers::CreateBooklrStat.perform_async Date.current.to_s
    EnterpriseReports::DailyReports::RHPG.perform_async Date.current.to_s
  end

  # Waits for reports/stats, then queues the nightly backup and status report.
  def queue_status_report
    wait_on_jobs [BooklrStatWorkers::CreateBooklrStat, BooklrStatWorkers::SetNumberOfBookVersions, BooklrStatWorkers::SetNumberOfIngestedBookVersions, ReportGeneratorWorker], 300, 1

    BackupWorkers::NightlyMongo.perform_async
    BooklrStatWorkers::DailyBooklrStatReport.perform_async
  end

  # Waits for the status report, then clears the daily report stats key.
  def queue_wrapup
    wait_on_jobs [BooklrStatWorkers::DailyBooklrStatReport], 300, 1
    $redis.del 'daily_report_stats'
  end

  # Polls until none of the given job classes remain in Sidekiq and the total
  # queue backlog has dropped to queue_count or fewer jobs.
  def wait_on_jobs(jobs, time_to_wait, queue_count)
    sleep time_to_wait while Utilities.class_in_sidekiq?(jobs) || Sidekiq::Queue.all.sum(&:size) > queue_count
  end

  # Clears sleeping/phantom workers from each scraper queue before the next
  # stage starts.
  def clear_stale_jobs
    %w[bn_stat_scraper optimized_scraper amazon_author_page_scraper itunes_stat_scraper goodreads_stat_scraper].each do |queue|
      RedisUtilities.clear_sleeping_and_phantom_workers(queue, 3000)
    end
  end
end
-
1
module ScrapeTestWorkers
-
1
# Hits a single page with a given scraper page class and tallies, per
# scrape_* method, how often it returned data. Tallies accumulate in a daily
# scrape_tests Mongo collection so scraper selector rot can be spotted.
class RunScrapeTest
  include Sidekiq::Worker
  sidekiq_options queue: :captcha_scraper

  def perform(page_class, mongo_id, url, user_agent = nil)
    klass = page_class.constantize
    page = user_agent.present? ? klass.new(url, user_agent) : klass.new(url)
    # All public scrape_* methods that returned something for this page.
    method_names = klass.public_instance_methods(false).select {|method| method.to_s.starts_with?('scrape_')}.select do |scrape_method|
      page.send(scrape_method).present?
    end

    if page.ok?
      if page_class == 'AmazonProductPage' && page.captcha?
        ScrapeTestWorkers::RunScrapeTest.perform_async page_class, mongo_id, url, user_agent
        # NOTE(review): bare `sleep` blocks this worker thread forever after
        # the requeue, so the captcha page is never tallied below -- confirm
        # this is intentional.
        sleep
      end
      ScrapeTestWorkers::RunScrapeTest.collection.update({_id: mongo_id},
                                                         {'$inc' => method_names.each_with_object({}) {|method_name, hash| hash["#{method_name}.count"] = 1}},
                                                         upsert: true)
    else
      ScrapeTestWorkers::RunScrapeTest.collection.update({_id: mongo_id},
                                                         {'$inc' => {dead_page: 1}},
                                                         upsert: true)
    end
  end

  # Daily Mongo collection holding the scrape-test tallies.
  def self.collection
    $mongodb["scrape_tests_#{Date.current.to_s.underscore}"]
  end

  # Samples a fraction of statable book versions per store/tld and queues a
  # scrape test for each sampled page.
  def self.queue
    amazon_com_count = (WarehouseBookVersion.amazon_statable.com.count * 0.01).ceil
    amazon_couk_count = (WarehouseBookVersion.amazon_statable.couk.count * 0.5).ceil
    bn_count = (WarehouseBookVersion.bn_statable.count * 0.01).ceil
    values = []

    mongo_id = "#{AmazonProductPage.to_s.underscore}_com"
    WarehouseBookVersion.amazon_statable.com.value_of(:asin).sample(amazon_com_count).each do |asin|
      values << [AmazonProductPage.to_s, mongo_id, Urls.amazon_book_page(asin, '.com')]
    end
    collection.update({_id: mongo_id}, {'$set' => {total_scrapes: amazon_com_count}}, upsert: true)

    mongo_id = "#{AmazonProductPage.to_s.underscore}_couk"
    WarehouseBookVersion.amazon_statable.couk.value_of(:asin).sample(amazon_couk_count).each do |asin|
      values << [AmazonProductPage.to_s, mongo_id, Urls.amazon_book_page(asin, '.couk')]
    end
    collection.update({_id: mongo_id}, {'$set' => {total_scrapes: amazon_couk_count}}, upsert: true)

    mongo_id = BnBookPage.to_s.underscore
    # NOTE(review): samples value_of(:asin) but the block variable is `ean`
    # and it is fed to Urls.bn_book_page -- confirm the intended column.
    WarehouseBookVersion.bn_statable.value_of(:asin).sample(bn_count).each do |ean|
      values << [BnBookPage.to_s, mongo_id, Urls.bn_book_page(ean)]
    end
    collection.update({_id: mongo_id}, {'$set' => {total_scrapes: bn_count}}, upsert: true)

    Sidekiq::Client.push_bulk('class' => ScrapeTestWorkers::RunScrapeTest, 'args' => values)
  end
end
-
-
1
# Samples how often an Amazon description scrape succeeds: bumps a redis
# counter on success and records the description length in a redis set.
class DescriptionCoverage
  include Sidekiq::Worker
  sidekiq_options queue: :scraper

  def perform(asin, tld)
    page = AmazonProductPage.by_asin_and_tld asin, tld
    return unless page.ok?

    description = page.scrape_amazon_description
    # if description successful, incr descSuccess by 1, or set 1 if key not found
    return if description.blank?

    $redis.incr('desc_success')
    # max character counter
    $redis.sadd('desc_length_counts', description.size)
  end
end
-
-
#class BnScrapeTest
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :spider
-
#
-
# def perform(isbn13)
-
# agent = Mechanize.new
-
# agent.user_agent_alias = 'Windows IE 6'
-
# page = agent.get(Urls.bn_book_page(isbn13)).parser
-
#
-
# if page.present?
-
# $redis.sadd('present_pages', isbn13)
-
# BnBookPageScraper.get_price(page).present? ? $redis.sadd('present_pages_with_price', isbn13) : $redis.sadd('present_pages_no_price', isbn13)
-
# else
-
# $redis.sadd('null_pages', isbn13)
-
# end
-
# end
-
#end
-
-
#class BnScrapeWithRescueTest
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :spider
-
#
-
# def perform(isbn13_or_url)
-
# isbn13 = isbn13_or_url.length == 13 ? isbn13_or_url : isbn13_or_url.split('isbn=').last
-
# begin
-
# agent = Mechanize.new
-
# agent.user_agent_alias = 'Mac FireFox'
-
# page = isbn13_or_url.length == 13 ? agent.get(Urls.bn_book_page(isbn13)).parser : agent.get(isbn13_or_url).parser
-
#
-
# if page.present?
-
# $redis.sadd('present_pages', isbn13)
-
# BnBookPageScraper.get_price(page).present? ? $redis.sadd('present_pages_with_price', isbn13) : $redis.sadd('present_pages_no_price', isbn13)
-
# else
-
# $redis.sadd('null_pages', isbn13)
-
# end
-
# rescue *HTTP_ERRORS => e
-
# $redis.sadd('null_pages', isbn13)
-
# $redis.incr(e.class.to_s)
-
# Rails.logger.tagged('httperror') {Rails.logger.info "#{isbn13} - http error: #{e}"}
-
# end
-
# end
-
#end
-
end
-
1
module SpiderWorkers
-
1
class QueueCategorySpidering
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Fans out CollectAsins jobs: one per (page, category) pair for every
  # canonical Amazon category whose name matches `base_category_name`.
  #
  # base_category_name - SQL LIKE pattern matched against category names
  # recent             - whether CollectAsins should restrict to recent items
  # pages_to_scrape    - number of search result pages to spider per category
  def perform(base_category_name, recent, pages_to_scrape)
    Rails.logger.tagged('spidering') {Rails.logger.info "'#{base_category_name}' Scrape Started - recent check enabled?: #{recent}"}

    # Bind the pattern instead of interpolating it into the SQL string
    # (the original interpolation was a SQL injection risk).
    category_ids = WarehouseCategory.amazon.canonical.where('name LIKE ?', base_category_name).value_of(:category_id)
    params = category_ids.flat_map do |category_id|
      (1..pages_to_scrape).collect {|count| [count, category_id, recent]}
    end

    # Push in manageable slices so a huge argument list doesn't overwhelm redis.
    params.each_slice(12000) do |value_slice|
      Sidekiq::Client.push_bulk('class' => SpiderWorkers::CollectAsins, 'args' => value_slice)
    end
    Rails.logger.tagged('spidering') {Rails.logger.info "'#{base_category_name}' Scrapes Finished Queueing - recent check enabled?: #{recent}"}
  end
end
-
-
1
# Spiders a single Amazon search-result page for a category and stashes any
# newly-seen ASINs in the 'new_asins' redis set for later validation/ingestion.
class CollectAsins
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  def perform(page_num, category_id, recent)
    ProxyUtilities.proxy_setup :amazon

    url = "http://www.amazon.com/s?ie=UTF8&page=#{page_num}&rh=n:#{category_id}"
    url += ',p_n_date:1249100011' if recent

    page = AmazonSearchPage.new url
    return unless handle_captcha(page, 60, page_num, category_id, recent)

    spidered_asins = page.scrape_search_result_asins
    return if spidered_asins.blank?

    # clean out anything that isn't a valid asin or already exists on our master list
    valid_asins = spidered_asins.reject {|asin| Utilities.determine_key_type(asin).nil?}
    asins_to_add = valid_asins.reject {|asin| MongoUtilities.exists_in_all_asin_list? asin, '.com'}

    # add the rest to the redis list which will get consumed later
    $redis.sadd('new_asins', asins_to_add) if asins_to_add.present?
  end
end
-
-
1
class QueueValidateAndIngestAsins
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Queues validation and ingestion jobs for the spidered asins that result from QueueCategorySpidering
  # if count is enabled it will set a flag that the queued jobs just do counts
  # if check_asins is true it will make sure not to validate any books we already have
  #
  # format      - book format string that ValidateAndIngestAsins will match against
  # check_asins - skip (and purge from redis) asins already present in WarehouseBookVersion
  # count       - dry-run flag forwarded to each queued job
  def perform(format, check_asins = true, count = false)
    Rails.logger.tagged('spidering') {Rails.logger.info "#{format} validation and ingestion started on #{$redis.scard('new_asins')} new asins"}

    # get all asins scraped from spidering and subtract the existing asins from this list to get the uningested list
    asins = $redis.smembers('new_asins')
    asins_uningested = check_asins ? asins - WarehouseBookVersion.com.where(asin: asins).value_of(:asin) : asins

    #subtract the uningested ones from the redis list to determine which ones to remove from redis
    to_remove = asins - asins_uningested
    $redis.srem('new_asins', to_remove) if check_asins && to_remove.present?

    # One [asin, format, count] job per uningested asin, pushed in BLOCK_SIZE slices.
    asins_uningested.collect {|asin| [asin, format, count]}.each_slice(RedisUtilities::BLOCK_SIZE) do |value_slice|
      Sidekiq::Client.push_bulk('class' => SpiderWorkers::ValidateAndIngestAsins, 'args' => value_slice)
    end
    Rails.logger.tagged('spidering') {Rails.logger.info "#{format} validation and ingestion finished queuing, #{$redis.scard('new_asins')} new asins remain"}
  end
end
-
-
1
# Validates a single spidered asin (format match + sales rank under 2M) and
# either ingests it or files it away. With `count` set it only tallies results.
class ValidateAndIngestAsins
  include Sidekiq::Worker
  sidekiq_options queue: :optimized_scraper

  # validate that each book is valid and below 2m rank before ingesting (or increasing count)
  def perform(asin, format, count = false)
    ProxyUtilities.proxy_setup :amazon
    tld = '.com'

    page = AmazonProductPage.new Urls.amazon_book_page(asin, tld), 'Windows IE 6'
    return unless handle_captcha(page, 60, asin, format, count)

    rank = page.scrape_sales_rank
    scraped_format = page.scrape_book_format

    # if a asin passes create a new book version and remove it from the redis set
    # else remove it from the redis set and add it to the overflow set
    # if count is set just incr counts and dont do any real work
    valid = scraped_format.present? && scraped_format.include?(format) && rank.to_i < 2000000

    if count
      $redis.incr(valid ? 'valid_asins' : 'invalid_asins')
    elsif valid
      Sidekiq::Client.push('queue' => 'background', 'class' => 'BookVersionWorkers::Create', 'args' => [{asin: asin, book_format: scraped_format, source: 'amazon-spidering', status: 'validated', tld: tld}])
      $redis.srem 'new_asins', asin
    else
      $redis.srem 'new_asins', asin
      $redis.sadd 'filtered_asins', asin
    end
  end
end
-
end
-
1
module WarehouseCategoryWorkers
-
1
# Creates an Amazon WarehouseCategory from the given attributes unless an
# identical (name, category_id, tld) record already exists. Depth and parent
# are derived from the category name before creation.
class CreateAmazon
  include Sidekiq::Worker
  sidekiq_options queue: :high

  def perform(params)
    params = params.with_indifferent_access
    return if WarehouseCategory.amazon.where(name: params[:name], category_id: params[:category_id], tld: params[:tld]).exists?

    params[:depth] = Utilities.get_depth_from_category_name params[:name]
    params[:parent_id] = WarehouseCategory.get_parent_id_for_category_name params[:name], params[:tld]
    WarehouseCategory.amazon.create! params
  end
end
-
-
1
class QueueScrapes
  include Sidekiq::Worker
  sidekiq_options queue: :scheduling

  # Refreshes the Amazon category collection and queues one
  # AmazonCategoryScraper job per acceptable base category.
  def perform
    AmazonCategoryCollection.new.prepare_for_scraping

    # value_of with multiple columns already returns [category_id, name, tld]
    # triples, so the identity `collect` the original performed was redundant.
    params = WarehouseCategory.amazon.where(name: AmazonBestSellersPage::ACCEPTABLE_BASE_CATEGORIES).value_of(:category_id, :name, :tld)
    Sidekiq::Client.push_bulk 'class' => MongoBookCategoryWorkers::AmazonCategoryScraper, 'args' => params
  end
end
-
-
1
# Syncs WarehouseCategory statuses with the scraped Amazon category collection:
# 1. groups scraped categories by status,
# 2. marks DW categories missing from the scrape as deleted,
# 3. bulk-updates statuses in 100-condition slices,
# 4. re-links alternative/deleted categories to their canonical counterpart.
class UpdateStatusAmazon
  include Sidekiq::Worker
  sidekiq_options queue: :sync

  def perform
    arel_table = WarehouseCategory.arel_table
    scraped_category_data = AmazonCategoryCollection.new.find.to_a

    # Group scraped category data by status
    scraped_category_data_by_status = scraped_category_data.each_with_object({alternative: [], deleted: [], canonical: []}.with_indifferent_access) do |data, hash|
      hash[data['status']] << [data['category_id'], data['category_name'], data['tld']]
    end

    # Add categories found in the DW but not in the category collection to the deleted group
    # NOTE(review): the `where{...}` block conditions look like Squeel syntax — confirm the gem is loaded.
    dw_category_data = WarehouseCategory.amazon.where{status != 'deleted'}.value_of(:category_id, :name, :tld)
    data_to_match = scraped_category_data.collect {|data| [data['category_id'], data['category_name'], data['tld']]}
    (dw_category_data - data_to_match).each do |data|
      scraped_category_data_by_status[:deleted] << data
    end

    # Update statuses
    scraped_category_data_by_status.each_pair do |status, data|
      # One Arel equality condition per (category_id, name, tld) triple, OR-ed together below.
      conditions = data.collect do |category_id, name, tld|
        arel_table[:category_id].eq(category_id).and(arel_table[:name].eq(name)).and(arel_table[:tld].eq(tld))
      end
      # Need to break up the update call into manageable slices for the DB
      conditions.each_slice(100) do |condition_slice|
        final_condition = condition_slice.reduce {|intermediate, condition| intermediate.or(condition)}
        timestamp = Time.current
        updated_fields = {status: status, updated_at: timestamp}
        updated_fields[:canonical_category_id] = nil if status == 'canonical' # Remove canonical category association for any canonical category
        WarehouseCategory.amazon.where(final_condition).update_all(updated_fields)
      end
    end

    # Update all alternative and deleted categories with their canonical_category
    non_canonical_categories = WarehouseCategory.amazon.where{(status == 'alternative') | (status == 'deleted')}.includes(:canonical_category)
    # Lookup of canonical record ids keyed by tld then category_id (one sub-hash per TLD).
    canonical_category_ids_by_category_id_and_tld = WarehouseCategory.amazon.canonical.where(category_id: non_canonical_categories.collect(&:category_id)).each_with_object(Hash[Utilities::TLDS.zip(Array.new(Utilities::TLDS.count) {Hash.new})]) do |category, hash|
      hash[category.tld][category.category_id] = category.id
    end
    non_canonical_categories.each do |category|
      # Only relink when the association is missing or points at a different category_id.
      if category.canonical_category.blank? || category.canonical_category.category_id != category.category_id
        canonical_category_id = canonical_category_ids_by_category_id_and_tld[category.tld][category.category_id]
        if canonical_category_id.present?
          category.canonical_category_id = canonical_category_id
          category.save
        end
      end
    end
  end
end
-
-
1
# Derives and persists the parent for a single warehouse category.
# save! raises on an invalid record so the failure surfaces in Sidekiq.
class FillInWarehouseCategoryParent
  include Sidekiq::Worker
  sidekiq_options queue: :high

  def perform(warehouse_category_id)
    category = WarehouseCategory.find(warehouse_category_id)
    category.set_parent_id
    category.save!
  end
end
-
-
#TODO Needs to be redone very slightly, BarnesNobleBookCategory no longer exists
-
#class FindNewTitles
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :background
-
#
-
# def perform(bn_book_category_id)
-
# ActiveRecord::Base.connection.uncached do
-
# bn_book_category = BarnesNobleBookCategory.find bn_book_category_id
-
# book_formats = bn_book_category.physical_category? ? %w[Hardcover Paperback] : ["NOOK Book"]
-
#
-
# (1..11).each do |page_number|
-
# book_formats.each do |book_format|
-
# page = BnCategoryPageScraper.get_page bn_book_category.category_id, book_format, page_number
-
#
-
# ean_list = []
-
#
-
# page.css("li.result").each do |li|
-
# div = li.css("div")
-
# ean = div.first["data-bn-ean"]
-
# format = div.css("div.price-format > a > span.format").text
-
# ean_list << ean if ISBN_Tools.is_valid_isbn13?(ean) && format.include?(book_format)
-
# end
-
#
-
# existing_isbns = BookVersion.where(isbn13: ean_list, tld: '.com').collect(&:isbn13)
-
# creations = ean_list - existing_isbns
-
#
-
# # Turn this into a batch insert if we want to optimize, persist format as it is unless its Nook then use Kindle Edition
-
# book_format_persist = book_format.include?("NOOK Book") ? "Kindle Edition" : book_format
-
# count = $redis.get("creation_count").to_i
-
# $redis.set("creation_count",count+creations.count) unless creations.blank?
-
# #creations.each {|ean| BookVersion.create(isbn13: ean, source: 'bn-category-spidering', book_format: book_format_persist)}
-
# end
-
# end
-
# end
-
# end
-
#end
-
-
# TODO: BROKEN, calls AmazonBookCategory, wtf?
-
#class CreateSubcategories
-
# include Sidekiq::Worker
-
# sidekiq_options queue: :high
-
#
-
# def perform(bn_book_category_id)
-
# ActiveRecord::Base.connection.uncached do
-
# bn_book_category = BarnesNobleBookCategory.find bn_book_category_id
-
# page = BnCategoryPageScraper.get_page bn_book_category.category_id, nil, nil
-
#
-
# # Check if sub categories exist before we try to create categories
-
# if page.css('.search-filter ul.filter-value-set').present? && page.css('.search-filter div.filter-name').first.text.include?("In ")
-
# # first ul.filter-value-set is the category block, rest are other aref filters
-
# page.css('.search-filter ul.filter-value-set').first.css('li > a').each do |element|
-
# category_id = URI.unescape(element['href']).split('dref=').last
-
# category_name = element.text.strip
-
# depth = category_id.count ","
-
# bn_book_category = BarnesNobleBookCategory.where(category_id: category_id, parent_id: bn_book_category_id, name: category_name, depth: depth).first_or_create unless category_id.blank? || AmazonBookCategory.where(category_id: category_id).exists?
-
# BarnesNobleListStatWorkers::CreateSubcategories.perform_async bn_book_category.id
-
# end
-
# end
-
# end
-
# end
-
#end
-
end
-
1
# App-wide configuration holder. Read and written through the generated
# AmazeBot.config accessor (ActiveSupport's mattr_accessor); starts empty.
module AmazeBot
  mattr_accessor :config
  self.config = {}
end
-
1
# Minimal bitmask attribute support for ActiveRecord models: an integer column
# stores a set of named values, bit i corresponding to values[i].
module Bitmaskable
  # Describes one bitmask-backed attribute: the integer column (`attribute`),
  # the ordered value names (`values`) and options (currently :as — the name
  # of the generated reader/writer pair, defaulting to the column name).
  class Definition
    attr_reader :attribute, :values, :extension

    def initialize(attribute, values = [], options = {})
      @attribute = attribute
      @values = values
      @options = options
    end

    # Defines the reader/writer pair on the given model class.
    def install_on(model)
      override model
    end

    #######
    private
    #######

    # Generates the accessor pair. The reader decodes the integer column into
    # the subset of `values` whose bit is set; the writer encodes an array of
    # names back into the column via write_attribute (unknown names ignored).
    def override(model)
      # override getter
      model.class_eval %(
        def #{@options[:as] || attribute}
          #{@values}.reject {|r| ((#{attribute} || 0) & (1 << #{@values}.index(r))).zero? }
        end
      )

      # override setter
      model.class_eval %(
        def #{@options[:as] || attribute}=(roles)
          string_roles = Array.wrap(roles).map {|t| t.to_s.downcase}
          send :write_attribute, :#{attribute}, ((string_roles & #{@values}).map { |r| 1 << #{@values}.index(r) }.sum)
        end
      )
    end
  end

  module ClassMethods
    # Declares a bitmask attribute, e.g.
    #   bitmask :roles_mask, values: %w[admin user], as: :roles
    # Raises ArgumentError unless an Array :values option is supplied.
    def bitmask(attribute, options={})
      # BUGFIX: the message used to say ":as option" although the check is on :values.
      raise ArgumentError, "Must provide an Array :values option" unless options[:values] && options[:values].kind_of?(Array)

      bitmask_definitions[attribute] = Bitmaskable::Definition.new(attribute, options.delete(:values).to_a, options)
      bitmask_definitions[attribute].install_on(self)
    end

    # Registry of bitmask definitions declared on this class, keyed by attribute.
    def bitmask_definitions
      @bitmask_definitions ||= {}
    end
  end
end
-
-
1
# Expose `bitmask` as a class macro on every ActiveRecord model.
ActiveRecord::Base.extend Bitmaskable::ClassMethods
-
1
# Extensions reopening ActiveRecord::Base: lightweight column plucking
# (value_of), a raw multi-row INSERT (batch_insert) and Arel-based
# join/select helpers used by the reporting queries.
module ActiveRecord
  class Base
    # Bare attr_accessible: nothing is mass-assignable by default (Rails 3 API).
    attr_accessible

    # Plucks one or more columns without instantiating model objects,
    # type-casting each value through the column definition. Returns a flat
    # array for a single column, otherwise an array of row arrays.
    def self.value_of(*args)
      columns = args.each_with_object({}) {|arg, hash| hash[arg] = columns_hash[arg.to_s]}
      values = connection.execute(connection.unprepared_statement {select(args).to_sql}).collect do |res|
        args.collect {|arg| columns[arg].type_cast(res[arg.to_s])}
      end

      args.count == 1 ? values.flatten : values
    end

    # Inserts many rows with a single INSERT statement. `array_of_attributes`
    # is an array of rows, each ordered to match `column_names`. STI `type`
    # and `created_at`/`updated_at` are filled in automatically when the
    # table has them and the caller did not supply them. Returns inserted ids.
    # Raises ArgumentError for unknown columns or ragged rows.
    def self.batch_insert(column_names, array_of_attributes)
      unless column_names.all? {|column_name| self.column_names.include? column_name}
        raise ArgumentError.new("Column(s) provided nonexistent: #{column_names.collect {|column_name| column_name unless self.column_names.include? column_name}.compact.join(', ')}")
      end
      raise ArgumentError.new('Wrong number of values to insert') unless array_of_attributes.all? {|attributes| attributes.size == column_names.size}
      # Deep copy so the appends below never mutate the caller's arrays.
      local_column_names = Marshal.load(Marshal.dump(column_names))
      local_array_of_attributes = Marshal.load(Marshal.dump(array_of_attributes))
      if columns_hash['type'].present? && !local_column_names.include?('type')
        local_column_names << 'type'
        local_array_of_attributes.each {|attributes| attributes << to_s}
      end
      timestamp = Time.current
      if columns_hash['created_at'].present? && !local_column_names.include?('created_at')
        local_column_names << 'created_at'
        local_array_of_attributes.each {|attributes| attributes << timestamp}
      end
      if columns_hash['updated_at'].present? && !local_column_names.include?('updated_at')
        local_column_names << 'updated_at'
        local_array_of_attributes.each {|attributes| attributes << timestamp}
      end
      columns = local_column_names.map {|name| columns_hash[name.to_s]}
      columns_sql = "(#{local_column_names.map{|name| connection.quote_column_name(name) }.join(',')})"
      insert_sql = "INSERT INTO #{quoted_table_name} #{columns_sql} VALUES "
      values_sql = local_array_of_attributes.map do |arr|
        my_values = arr.each_with_index.map do |val,j|
          column = columns[j]
          # nil primary keys draw from the sequence (sequence-based adapters);
          # everything else goes through connection.quote, so values are escaped.
          (!sequence_name.blank? && column.name == primary_key && val.nil?) ? %{#{sequence_name}.nextval} : connection.quote(column.type_cast(val), column)
        end
        "(#{my_values.join(',')})"
      end
      sql2insert = insert_sql + values_sql.join( ',' )
      sql, binds = connection.sql_for_insert(connection.to_sql(sql2insert), nil, nil, nil, [])
      res = connection.exec_insert sql, 'Batch Insert', binds
      res.rows.flatten.collect(&:to_i)
    end

    # LEFT OUTER JOIN each named association (aliased to the association name).
    def self.outer_joins(*args)
      joins(args.flatten.collect {|association_name| generate_join(Arel::Nodes::OuterJoin, association_name)})
    end

    # Selects fields from joined associations, aliasing each as
    # "<association>_<field>" and optionally applying a named transform
    # (currently :price — integer cents truncated to dollars) to price fields.
    def self.join_select(join_type, auto_transform, join_selects)
      # BUGFIX: these guards used `throw`, which is for catch/throw control flow
      # and raised UncaughtThrowError; `raise` is the intended mechanism.
      raise ArgumentError unless %w[inner outer].include?(join_type.to_s)
      raise ArgumentError unless join_selects.is_a?(Array) || join_selects.is_a?(Hash)

      select(join_selects.collect do |association_name, fields_or_transforms|
        table = reflect_on_association(association_name).klass.arel_table
        table = table.alias(association_name) if join_type.to_s == 'outer'
        fields_or_transforms.collect do |field_or_transform|
          field, transform = field_or_transform
          transform ||= {}
          transform.reverse_merge!(as: "#{association_name}_#{field}")
          if auto_transform
            transform.reverse_merge! transform_name: :price if field.to_s.include?('price')
          end
          transform_select_statement table, field, transform
        end
      end.flatten) # non-bang flatten: flatten! returns nil when there is nothing to flatten
    end

    # Like join_select but on the model's own table: selects fields with
    # optional per-field transforms, auto-transforming price fields when asked.
    def self.transforming_select(auto_transform, fields_or_transforms = {})
      raise ArgumentError unless fields_or_transforms.is_a?(Array) || fields_or_transforms.is_a?(Hash)

      select(fields_or_transforms.collect do |field_or_transform|
        field, transform = field_or_transform
        transform ||= {}
        if auto_transform
          transform.reverse_merge! transform_name: :price if field.to_s.include?('price')
        end
        transform_select_statement arel_table, field, transform
      end)
    end

    # Builds one relation combining base-table selects with inner and outer
    # joined association selects. '*' selects all base columns; each joins hash
    # maps association name => select values (see generate_select).
    def self.single_query_join_and_select(base_model_select_statements, inner_joins, outer_joins)
      joins = []
      select_statements = base_model_select_statements == '*' ? [arel_table[Arel.star]] : Array.wrap(base_model_select_statements.try(:dup))

      inner_joins.each_pair do |association_name, select_values|
        joins << generate_join(Arel::Nodes::InnerJoin, association_name)
        select_statements << generate_select(select_values, association_name) unless select_values.blank?
      end if inner_joins.present?

      outer_joins.each_pair do |association_name, select_values|
        joins << generate_join(Arel::Nodes::OuterJoin, association_name)
        select_statements << generate_select(select_values, association_name) unless select_values.blank?
      end if outer_joins.present?

      select(select_statements).joins(joins)
    end

    # NOTE(review): `private` below does not affect `def self.` singleton
    # methods; the helpers remain publicly callable. Left as-is to avoid
    # breaking any external callers.
    private

    # Arel join node for the association, aliased to the association name,
    # with the ON condition derived from the reflection's key direction.
    def self.generate_join(join_type, association_name)
      table = arel_table
      reflection = reflect_on_association(association_name)
      join_table = reflection.klass.arel_table.alias(association_name)
      condition = reflection.belongs_to? ? join_table[reflection.active_record_primary_key].eq(table[reflection.foreign_key]) : join_table[reflection.foreign_key].eq(table[reflection.active_record_primary_key])
      table.create_join(join_table, table.create_on(condition), join_type)
    end

    # Applies the named transform to a field select, aliased via :as (defaults
    # to the field name). :price truncates integer cents to two-decimal dollars.
    def self.transform_select_statement(table, field_name, transform_details)
      return table[field_name] if transform_details.blank?

      case transform_details[:transform_name]
      when :price
        Arel::Nodes::NamedFunction.new('trunc', [table[field_name] / 100.0, 2])
      else
        table[field_name]
      end.as Arel.sql("\"#{(transform_details[:as].presence || field_name)}\"")
    end

    # Select list for a joined association: a raw SQL string is passed through;
    # "!column" entries select everything except the named columns; otherwise
    # each listed column is selected aliased as "<association>_<column>".
    def self.generate_select(select_values, association_name)
      return if select_values.blank?

      reflection = reflect_on_association(association_name)
      join_table = reflection.klass.arel_table.alias(association_name)
      if select_values.is_a? String
        select_values
      elsif select_values.all? {|select_value| select_value.starts_with? '!'}
        ignored_columns = select_values.map {|select_value| select_value[1..-1]}
        reflection.klass.column_names.collect {|column_name| join_table[column_name].as "#{association_name}_#{column_name}" unless ignored_columns.include? column_name}.compact
      else
        select_values.collect {|column_name| join_table[column_name].as "#{association_name}_#{column_name}"}
      end
    end
  end
end
-
# Add methods to Enumerable, which makes them available to Array
-
1
# Statistical helpers mixed into Array (mean, median, mode, variance, etc.).
class Array
  # Arithmetic mean as a Float (NaN for an empty array, matching 0/0.0).
  def mean
    sum.fdiv(length)
  end

  # Median value; for even-sized arrays, the mean of the two middle elements.
  def median
    midpoint = empty? ? 0 : (length - 1) / 2.0
    sort[midpoint.floor..midpoint.ceil].mean
  end

  # Count of each distinct element; missing keys read as 0.
  def frequencies
    counts = Hash.new(0)
    each {|element| counts[element] += 1}
    counts
  end

  # All elements tied for the highest frequency, sorted ascending.
  def mode
    counts = frequencies
    top = counts.values.max
    counts.keys.select {|value| counts[value] == top}.sort
  end

  # Unbiased sample variance (n - 1 denominator); 0.0 for fewer than two elements.
  def sample_variance
    return 0.0 unless count > 1
    avg = mean
    squared_deviations = reduce(0) {|accum, value| accum + (value - avg) ** 2}
    squared_deviations / (count - 1).to_f
  end

  # Sample standard deviation (square root of the sample variance).
  def standard_deviation
    Math.sqrt(sample_variance)
  end

  # Moving average over windows of `span` consecutive elements.
  def moving_average(span = 1)
    each_cons(span).map {|window| window.mean}
  end
end
-
1
# Extends Date with parsing of strings containing French, German, Italian or
# Spanish month names by translating the month to English before Date.parse.
class Date
  # Parses a date string whose month name may be in a supported non-English
  # language; falls back to plain Date.parse behavior otherwise.
  def self.parse_international(string)
    parse(month_to_english(string))
  end

  # Maps each localized month name to the English Date::MONTHNAMES entry
  # (MONTHNAMES is 1-indexed with a leading nil, hence the [nil] prepend).
  def self.make_hash(names)
    Hash[([nil]+names).zip(MONTHNAMES)]
  end

  MONTH_TRANSLATIONS = {}
  MONTH_TRANSLATIONS.merge! make_hash(%w/janvier février mars avril mai juin juillet août septembre octobre novembre décembre/) # French
  MONTH_TRANSLATIONS.merge! make_hash(%w/januar februar märz april mai juni juli august september oktober november dezember/) # German
  MONTH_TRANSLATIONS.merge! make_hash(%w/gennaio febbraio marzo aprile maggio giugno luglio agosto settembre ottobre novembre dicembre/) # Italian
  MONTH_TRANSLATIONS.merge! make_hash(%w/enero febrero marzo abril mayo junio julio agosto septiembre octubre noviembre diciembre/) # Spanish
  MONTH_TRANSLATIONS.freeze

  # Replaces the first month-name-looking token with its English translation,
  # or returns the string unchanged when no translation is known.
  def self.month_to_english(string)
    month_from = string[/[^\s\d,]+/i] # Search for a month name
    if month_from
      month_to = MONTH_TRANSLATIONS[month_from.downcase] # Look up the translation
      return string.sub(month_from, month_to.to_s) if month_to
    end
    return string
  end

  # BUGFIX: a bare `private` has no effect on singleton (def self.) methods;
  # private_class_method is required to actually hide the helpers.
  private_class_method :make_hash, :month_to_english
end
-
1
module HerokuMongoBackup
  # Restores a gzipped backup (as written by Backup#store) from backup_location,
  # a URL or path readable via Kernel#open. The archive is a stream of
  # Marshal-dumped {collection_name => [documents]} hashes separated by '!-!'.
  # Documents are inserted into "<collection>-restored" so live data is never
  # clobbered.
  # NOTE(review): SSL verification is disabled on the download and Marshal.load
  # runs on the file contents — only restore from trusted backup locations.
  def self.load_from_file(backup_location, block_size = 1250)
    count = 1

    # Download the archive to a local file named after the remote basename.
    open("#{File.basename(backup_location)}", 'wb') do |file|
      file << open(backup_location,:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE).read
    end

    # Iterate through the gzipped backup yielding records by iterating per separator !-!
    Zlib::GzipReader.open("#{File.basename(backup_location)}").each('!-!') do |record|
      # Unmarshal the string
      record_block = Marshal.load(record)
      collection = record_block.keys.first

      Rails.logger.tagged('backup') {Rails.logger.info "Restoring from #{collection}: block #{(count-1)*block_size} - #{count*block_size}"}

      record_block[collection].each_slice(block_size) do |slice|
        # append -restored to any collection you insert to so you don't clobber the existing data if it still exists
        $mongodb.collection(collection+'-restored').insert slice
      end

      count += 1
    end
  end

  # Dumps mongo collections to a gzipped archive and uploads it to S3.
  class Backup
    # Writes each collection to the gzip stream as Marshal-dumped
    # {name => [docs]} blocks separated by '!-!' (the format load_from_file reads).
    def store(collections, block_size)
      file = File.new(@file_name, 'wb')
      file = Zlib::GzipWriter.new(file)

      collections.each do |col|
        Rails.logger.tagged('backup') {Rails.logger.info "Backing up collection: #{col.name}"}

        # timeout: false keeps the server-side cursor alive for large collections.
        col.find({}, timeout: false) do |cursor|
          total = col.count
          i = 1
          cursor.each_slice(block_size) do |slice|
            Rails.logger.tagged('backup') {Rails.logger.info "#{i*block_size} out of #{total}"}
            i += 1
            backup = {}

            # Add each block of records to a hash associated with the key of the collection
            backup[col.name] = slice

            file.print Marshal.dump(backup)
            file.print '!-!'
          end
        end
      end

      # cleanup file
      file.close
    end

    # Uses the app-wide mongo connection.
    def db_connect
      @db = $mongodb
    end

    # date    - the date the backed-up data belongs to (used for the file name)
    # connect - set false to skip the S3 connection (e.g. in tests)
    def initialize date = Date.current, connect = true
      @date = date
      # date-of-data--date-created|time-created.gz
      # 2014-02-11--2014-02-14|12:16:19.gz
      # NOTE(review): the strftime format contains no % directives, so the file
      # name is effectively just "#{@date}.gz"; the comment above describes an
      # older naming scheme — confirm which is intended.
      @file_name = Time.current.strftime("#{@date.to_s}.gz")
      @dirname = 'backups'

      self.db_connect
      self.s3_connect if connect
    end

    # Dumps the given collections and pushes the archive to S3.
    def backup(collections, block_size = 1000)
      #self.chdir
      self.store(collections, block_size)
      self.s3_upload
    end

    def s3_upload
      HerokuMongoBackup::s3_upload(@bucket, @dirname, @file_name)
    end

    # Opens the S3 bucket using the CarrierWave fog credentials for this environment.
    def s3_connect
      @bucket = HerokuMongoBackup::s3_connect(AmazeBot.config[:carrier_wave][:fog_directory][Utilities.env],
                                              AmazeBot.config[:carrier_wave][:fog_credentials][:aws_access_key_id],
                                              AmazeBot.config[:carrier_wave][:fog_credentials][:aws_secret_access_key])
    end
  end
end
-
1
# Scheduling helpers monkey-patched onto rufus-scheduler. Times are converted
# into the application's time zone and expressed as cron lines (with the zone
# name appended, which rufus-scheduler supports).
class Rufus::Scheduler::SchedulerCore
  # Runs the block once per day at start_time (interpreted in the app zone).
  def every_day(start_time)
    time = start_time.in_time_zone
    cron "#{time.min} #{time.hour} * * * #{ActiveSupport::TimeZone::MAPPING[Time.zone.name]}" do
      yield
    end
  end

  # Runs the block once per week on the given weekday (symbol or string,
  # e.g. :monday) at start_time.
  def every_week(weekday, start_time)
    time = start_time.in_time_zone
    weekday_to_cron_number = {sunday: 0, monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6}.with_indifferent_access
    cron "#{time.min} #{time.hour} * * #{weekday_to_cron_number[weekday]} #{ActiveSupport::TimeZone::MAPPING[Time.zone.name]}" do
      yield
    end
  end

  # Runs the block every `interval`, but only within the daily window
  # [start_time, end_time]: an interval job is started at start_time and
  # unscheduled at end_time each day. If "now" already falls inside today's
  # window — or inside a window that began yesterday and spills into today —
  # the interval job is started immediately.
  def schedule_job_for_time_period_every(interval, start_time, end_time)
    start_time_with_zone = start_time.in_time_zone
    end_time_with_zone = end_time.in_time_zone
    j = nil

    time_today = Time.current
    duration = end_time_with_zone - start_time_with_zone
    # Today's occurrence of the window's start (time-of-day only).
    start_time_today = Time.zone.parse(start_time_with_zone.strftime('%H:%M:%S %Z'))

    # if current time is in an interval starting today OR current time is in an interval starting yesterday and ending today
    if (start_time_today..start_time_today + duration).cover?(time_today) || (start_time_today - 24.hours..start_time_today - (24.hours - duration)).cover?(time_today)
      j = every interval do
        yield
      end
    end

    # Start the interval job at each day's window start...
    every_day start_time_with_zone do
      j = every interval do
        yield
      end
    end

    # ...and tear it down at the window end (try: j may be nil before the first start).
    every_day end_time_with_zone do
      j.try(:unschedule)
    end
  end
end
-
1
module Sidekiq
-
-
1
class Client
  # Staging queues are simply the production queue name suffixed with "_staging".
  def self.convert_to_staging_queue(queue)
    "#{queue}_staging"
  end

  # Like push_bulk, but redirects every job onto the staging variant of the
  # normalized queue. Returns the number of payloads pushed, or nil when there
  # was nothing to push.
  def self.push_bulk_staged(items)
    normalized = normalize_item(items)
    normalized['queue'] = convert_to_staging_queue normalized['queue']

    payloads = items['args'].collect do |args|
      raise ArgumentError, 'Bulk arguments must be an Array of Arrays: [[1], [2]]' unless args.is_a?(Array)
      process_single(items['class'], normalized.merge('args' => args, 'jid' => SecureRandom.hex(12), 'enqueued_at' => Time.now.to_f))
    end.compact

    return nil if payloads.empty?
    raw_push(payloads) ? payloads.size : nil
  end
end
-
-
1
module Worker
  # if socks error from tor client, just requeue the job and skip the rest by passing false
  # if throttled, requeue the job, sleep for desired time (if using the proxy) until you want to retry on this port
  # or if using the instance IP then set the dyno as throttled so future jobs know to immediately use the proxy
  #
  # Returns true when the page is OK to process; false when the job was
  # requeued (socks error / captcha / 503 throttle) or the page is dead.
  def handle_captcha(page, sleep_time, *args)
    if page.socks_error?
      self.class.perform_async *args
      false
    elsif (page.ok? && page.captcha?) || page.response_code == '503'
      requeue_and_throttle(sleep_time, *args)
    else
      page.ok?
    end
  end

  # BN Specific Version
  # if socks error from tor client, just requeue the job and skip the rest by passing false
  # if throttled (previously 429, seems to be 500 now), requeue the job, sleep for desired time (if using the proxy) until you want to retry on this port
  # or if using the instance IP then set the dyno as throttled so future jobs know to immediately use the proxy
  def bn_captcha_sleepy_time(page, sleep_time, *args)
    if page.socks_error?
      self.class.perform_async *args
      false
    elsif page.net_persistent_error? || page.response_code == '429' || page.response_code == '500'
      requeue_and_throttle(sleep_time, *args)
    else
      page.ok?
    end
  end

  private

  # Shared throttle handling (was duplicated verbatim in both methods above):
  # requeue this job, then either sleep the current proxy port or mark the
  # dyno as throttled. Always returns false so callers skip the rest of the job.
  def requeue_and_throttle(sleep_time, *args)
    self.class.perform_async *args

    if ProxyUtilities.using_proxy?
      sleep(sleep_time)
    else
      RedisUtilities.set_dyno_throttled
    end

    false
  end
end
-
end
-
1
require "net/http"
-
-
1
module HttpHelper
-
1
extend self
-
-
1
def get_html(url)
-
2
Nokogiri::HTML(fetch(url))
-
rescue *HTTP_ERRORS
-
1
nil
-
end
-
-
1
def get_json(url)
-
9
JSON.parse(fetch(url))
-
rescue JSON::ParserError, *HTTP_ERRORS => e
-
1
nil
-
end
-
-
1
def get_bn_search_page_html(url)
-
1
get_html_with_mechanize url, 'Mac FireFox'
-
end
-
-
1
def get_bn_list_stat_page_html(url)
-
10
get_html_with_mechanize url, 'Windows IE 6'
-
end
-
-
1
def get_bn_category_page_html(url, user_agent)
-
3
count = 0
-
3
valid_page = nil
-
-
# Loop 5 times on each page to get a result otherwise give up
-
3
while count < 5 do
-
8
html = get_html_with_mechanize url, user_agent
-
8
unless ScraperUtilities.bn_no_results? html
-
2
valid_page = html
-
2
break
-
end
-
6
count += 1
-
end
-
-
3
valid_page
-
end
-
-
1
def get_html_with_mechanize_no_rescue(url, user_agent_string)
-
187
agent = Mechanize.new
-
187
agent.user_agent_alias = user_agent_string
-
187
agent.idle_timeout = 5 if url.include? 'barnesandnoble.com'
-
187
page_or_file = agent.get(url)
-
#TODO: Convert all uses to Mechanize::Page, seems to be what is happening anyway?
-
186
page_or_file.class == Mechanize::Page ? page_or_file.parser : Nokogiri::HTML(page_or_file.body)
-
end
-
-
1
def get_html_with_mechanize(url, user_agent_string)
-
13
get_html_with_mechanize_no_rescue url, user_agent_string
-
rescue *HTTP_ERRORS => e
-
if e.class == Mechanize::ResponseCodeError
-
error = {error_class: Mechanize::ResponseCodeError.to_s, code: e.response_code}
-
else
-
error = {error_class: e.class.to_s}
-
end
-
Utilities.log('http_error', error)
-
{}
-
end
-
-
1
def get_canonical_amazon_url(asin, tld)
-
4
uri_str = Urls.amazon_book_page(asin, tld)
-
4
response = Net::HTTP.get_response(URI.parse(URI.encode(uri_str)))
-
3
case response
-
when Net::HTTPSuccess then
-
2
Rails.logger.tagged('cleanup') {Rails.logger.info "Basic URL #{uri_str} is valid no redirect"}
-
1
uri_str
-
when Net::HTTPRedirection then
-
1
if response['location'].present?
-
2
Rails.logger.tagged('cleanup') {Rails.logger.info "Basic URL redirects to: #{response['location']}"}
-
1
response['location']
-
end
-
else
-
nil
-
end
-
rescue *HTTP_ERRORS
-
1
nil
-
end
-
-
1
def get_canonical_bn_url(isbn13, tld)
-
5
return nil unless isbn13.present? && tld == '.com'
-
-
3
agent = Mechanize.new
-
3
agent.user_agent_alias='Windows IE 6'
-
3
agent.get(Urls.bn_book_page(isbn13))
-
2
canonical_url = agent.history.last.uri.to_s
-
2
canonical_url if canonical_url.exclude?('noresults')
-
rescue *HTTP_ERRORS
-
1
nil
-
end
-
-
1
# Resolves the canonical Goodreads book URL for ISBN/key +key+ with a
# single GET, honoring at most one redirect.
# Returns nil when the page is unreachable, the redirect Location is
# blank, or the redirect points at the bare /book page (which Goodreads
# uses to signal "no such book").
def get_canonical_goodread_url(key)
  candidate = Urls.goodreads_book_page(key)
  response = Net::HTTP.get_response(URI.parse(URI.encode(candidate)))
  redirect_target = response['location']

  if response.is_a?(Net::HTTPSuccess)
    Rails.logger.tagged('cleanup') {Rails.logger.info "Basic URL #{candidate} is valid no redirect"}
    candidate
  elsif response.is_a?(Net::HTTPRedirection)
    # 'http://www.goodreads.com/book' is not a valid url, this means there is no page for this key
    if redirect_target.present? && redirect_target != 'http://www.goodreads.com/book'
      Rails.logger.tagged('cleanup') {Rails.logger.info "Basic URL redirects to: #{redirect_target}"}
      redirect_target
    end
  end
rescue *HTTP_ERRORS
  nil
end
-
-
1
private
-
-
1
# Fetches +uri_str+ and returns the response body re-encoded to UTF-8,
# following up to +limit+ redirects (recursively).
#
# Raises ArgumentError when the redirect budget is exhausted or a
# redirect carries no Location header; other non-success responses raise
# via Net::HTTPResponse#value.
def fetch(uri_str, limit = 5)
  # You should choose a better exception.
  # <= (was ==) so a caller passing a negative limit cannot bypass the
  # redirect cap and recurse indefinitely.
  raise ArgumentError, 'too many HTTP redirects' if limit <= 0

  response = Net::HTTP.get_response(URI.parse(URI.encode(uri_str)))

  case response
  when Net::HTTPSuccess then
    body = response.body
    # BUG FIX: String#encoding returns an Encoding object, so the
    # original comparison against the string 'UTF-8' was always true and
    # re-encoded every body, including ones already in UTF-8.
    if body.encoding != Encoding::UTF_8
      original_encoding = body.encoding
      begin
        body = body.encode 'UTF-8'
      rescue Encoding::UndefinedConversionError, Encoding::ConverterNotFoundError
        # The declared encoding could not convert; ISO-8859-1 maps every
        # byte, so retry with it as a best-effort intermediate.
        body.force_encoding 'ISO-8859-1'
        begin
          body = body.encode 'UTF-8'
        rescue Encoding::UndefinedConversionError, Encoding::ConverterNotFoundError
          # Give up and restore the original label unchanged.
          body.force_encoding original_encoding
        end
      end
    end
    body
  when Net::HTTPRedirection then
    response['location'].present? ? fetch(response['location'], limit - 1) : (raise ArgumentError, 'Blank redirect!')
  else
    # Raises an exception appropriate to the response class (e.g. 4xx/5xx).
    response.value
  end
end
-
end
-
1
# Coder that serializes objects to/from JSON text columns.
# Appears designed as an ActiveRecord `serialize` coder (dump/load duck
# type) — confirm against call sites.  NULL columns load as the value
# produced by the configured default block; decoded hashes gain
# indifferent access.
class JsonColumn
  # Builds a coder whose +load+ yields a fresh result of +block+ for
  # NULL column values.
  def self.default_with(&block)
    new(block)
  end

  def initialize(default)
    @default = default
  end

  # Serializes +obj+ to a JSON string; nil passes through as nil (SQL NULL).
  def dump(obj)
    return if obj.nil?
    MultiJson.dump(obj)
  end

  # Deserializes the stored JSON string; nil (NULL) yields the default.
  def load(json)
    if json.nil?
      @default.call
    else
      decoded = MultiJson.load(json)
      decoded.is_a?(Hash) ? decoded.with_indifferent_access : decoded
    end
  end
end
-
1
require 'sidekiq'
-
-
1
# Autoscales Heroku dynos against Sidekiq queue backlog.
# Stateless utility module (extend self); requires HEROKU_API_KEY and
# HEROKU_APP in the environment to do anything.
module HerokuScaler
  extend self

  # When truthy, the verbose per-run diagnostics below are logged.
  @conditionally_log = nil

  # Re-reads the 'conditional_log' feature flag; call before scale! to
  # toggle diagnostic logging for the run.
  def set_conditionally_log
    @conditionally_log = Utilities.is_flag_set? 'conditional_log'
  end

  # Walks +configurations+ (each a hash read for :name, :queues,
  # :concurrency, :minimum_count, :maximum_count) and scales each named
  # process type up or down via the Heroku Platform API.
  # No-op unless both HEROKU_API_KEY and HEROKU_APP are set.
  def scale!(configurations = [])
    if ENV['HEROKU_API_KEY'] && ENV['HEROKU_APP']
      heroku = PlatformAPI.connect(ENV['HEROKU_API_KEY'])

      process_counts = get_process_counts(heroku)
      sidekiq_job_counts = get_sidekiq_job_counts
      Rails.logger.tagged('scaler') {Rails.logger.info "process_counts: #{process_counts}, sidekiq_job_counts: #{sidekiq_job_counts}"} if @conditionally_log
      configurations.each do |config|
        process_count = process_counts[config[:name]]
        total_job_count = config[:queues].sum {|queue| sidekiq_job_counts[queue]}
        needed_count = process_count_needed(total_job_count, config[:concurrency], config[:maximum_count])

        # Scale up only 50 max per 30 seconds
        # (step size is capped at 50 dynos per invocation in BOTH
        # directions; presumably scale! runs on a ~30s clock — confirm
        # against the scheduler that invokes it)
        if process_count > config[:minimum_count] && total_job_count == 0
          # Queues are drained: step down toward the configured floor.
          new_count = [process_count - 50, config[:minimum_count]].max
          Rails.logger.tagged('scaler') {Rails.logger.info "Scaling #{config[:name]} down to #{new_count} from #{process_count}"}
          scale_process! config[:name], new_count, heroku
        elsif needed_count > process_count
          # Backlog exceeds capacity: step up toward the computed need
          # (itself capped at :maximum_count by process_count_needed).
          new_count = [process_count + 50, needed_count].min
          Rails.logger.tagged('scaler') {Rails.logger.info "Scaling #{config[:name]} up to #{new_count}, current count:#{process_count} total_job_count:#{total_job_count}"}
          scale_process! config[:name], new_count, heroku
        end

        Rails.logger.tagged('scaler') {Rails.logger.info "process_count: #{process_count}, total_job_count: #{total_job_count}, if condition: #{process_count > config[:minimum_count] && total_job_count == 0}, elsif condition: #{process_count < process_count_needed(total_job_count, config[:concurrency], config[:maximum_count])}, config: #{config}"} if @conditionally_log
      end
    end
  end

  # Returns a HashWithIndifferentAccess (default 0) of process type =>
  # current dyno quantity from the Heroku formation API.  Retries once
  # on Excon errors.
  # NOTE(review): on the second failure nothing is returned (the rescue
  # falls through), so callers receive nil and would then index into it
  # — confirm this path is acceptable.
  def get_process_counts(heroku = PlatformAPI.connect(ENV['HEROKU_API_KEY']), first_try = true)
    process_counts = heroku.formation.list(ENV['HEROKU_APP']).each_with_object(HashWithIndifferentAccess.new(0)) do |process_details, hash|
      hash[process_details['type']] = process_details['quantity']
    end
    Rails.logger.tagged('scaler') {Rails.logger.info process_counts.inspect} if @conditionally_log
    process_counts
  rescue Excon::Errors::Error => e
    if first_try
      Rails.logger.tagged('scaler') {Rails.logger.info "Retrying process counts because of error: #{e}"} if @conditionally_log
      get_process_counts(heroku, false)
    else
      Rails.logger.tagged('scaler') {Rails.logger.info "Not retrying process counts, second error: #{e}"} if @conditionally_log
    end
  end

  # Returns queue name => job count (HashWithIndifferentAccess, default
  # 0), counting enqueued jobs plus jobs currently being worked.
  # NOTE(review): reads the 'workers' Redis set, which looks like the
  # pre-Sidekiq-3 layout — confirm against the pinned Sidekiq version.
  def get_sidekiq_job_counts
    counts = Sidekiq::Queue.all.each_with_object(HashWithIndifferentAccess.new(0)) {|queue, hash| hash[queue.name] = queue.size}
    Sidekiq.redis do |conn|
      conn.smembers('workers').each do |w|
        msg = conn.get("worker:#{w}")
        # In-flight jobs still count toward the queue's backlog.
        counts[Sidekiq.load_json(msg)['queue']] += 1 if msg.present?
      end
    end

    counts
  end

  # Dynos needed to drain +pending_job_count+ at +concurrency+ jobs per
  # dyno, capped at +maximum_count+.
  def process_count_needed(pending_job_count, concurrency, maximum_count)
    [(pending_job_count.to_f / concurrency).ceil, maximum_count].min
  end

  # Sets the dyno quantity for +process_name+.  Retries once on Excon
  # errors, then gives up silently (logged only when logging is enabled).
  def scale_process!(process_name, quantity, heroku = PlatformAPI.connect(ENV['HEROKU_API_KEY']), first_try = true)
    heroku.formation.update(ENV['HEROKU_APP'], process_name, {'quantity' => quantity})
  rescue Excon::Errors::Error => e
    if first_try
      Rails.logger.tagged('scaler') {Rails.logger.info "Retrying scale because of error: #{e}"} if @conditionally_log
      scale_process!(process_name, quantity, heroku, false)
    else
      Rails.logger.tagged('scaler') {Rails.logger.info "Not retrying scale, second error: #{e}"} if @conditionally_log
    end
  end
end
-
1
require 'json'
-
-
1
# Minimal builder for an X-SMTPAPI-style mail header: accumulates
# recipients and renders them as JSON.
class SmtpApiHeader
  def initialize(to = nil)
    @data = {}
    add_to to if to.present?
  end

  # Appends a single address or an array of addresses to the 'to' list.
  def add_to(to)
    recipients = to.kind_of?(Array) ? to : [to]
    @data['to'] = (@data['to'] || []) + recipients
  end

  # Renders the header data as JSON with a space inserted after ':' and
  # ',' separators that sit between quoted/bracketed tokens.
  # NOTE(review): the spacing looks like a mail-header folding
  # workaround — confirm against the SMTP API provider's docs.
  def as_json
    compact_json = JSON.generate @data
    compact_json.gsub(/(["\]}])([,:])(["\[{])/, '\\1\\2 \\3')
  end
end
-
1
# URL builders for the external sites the scrapers and cleanup jobs hit
# (Amazon, Barnes & Noble, Apple, Goodreads).
module Urls
  extend self

  # Best-seller category listing on Amazon.  +category_id+ may be blank
  # for the root list; +base_category+ 'Books' selects print books,
  # anything else the Kindle ('digital-text') store.
  # (Rewrote the original's nested string interpolation / repeated +=
  # into named segments; output is byte-identical.)
  def amazon_book_category_page(category_id, tld, base_category, page_number)
    store = base_category == 'Books' ? 'books' : 'digital-text'
    category_segment = category_id.present? ? "#{category_id}/" : ''
    ref_segment = category_id.present? ? "#{category_id}_" : ''

    "http://www.amazon#{tld}/Best-Sellers/zgbs/#{store}/#{category_segment}" \
      "ref=zg_bs_#{ref_segment}pg_#{page_number}?_encoding=UTF8&pg=#{page_number}"
  end

  def amazon_book_page(asin, tld)
    "http://www.amazon#{tld}/gp/product/#{asin}"
  end

  def amazon_author_page(author_asin, tld)
    "http://www.amazon#{tld}/a/e/#{author_asin}"
  end

  # Search URL for an ISBN-10 or ASIN.  A value containing any non-digit
  # is treated as an ASIN.
  def amazon_search_page(isbn_or_asin, tld)
    # field-asin parameter only works with search-alias parameter set
    if isbn_or_asin.scan(/\D/).present?
      "http://www.amazon#{tld}/gp/search/?search-alias=stripbooks&field-asin=#{isbn_or_asin}"
    else
      "http://www.amazon#{tld}/gp/search/?&field-isbn=#{isbn_or_asin}"
    end
  end

  # Related-format expansion data for a parent ASIN.  +count+ defaults to
  # the previous hard-coded maximum of 10 (the original comment asked for
  # this to be configurable; existing callers are unaffected).
  def amazon_related_format_url(parent_asin, dom_format, tld, count = 10)
    "http://www.amazon#{tld}/gp/media-matrix/fetch-expansion-data.html?metaBinding=#{dom_format}_meta_binding&productCategory=books&parentAsin=#{parent_asin}&startIndex=1&count=#{count}"
  end

  # Apple RSS feed of the top 200 ebooks in a genre.
  # Raises NotImplementedError unless +type+ is 'paid' or 'free'.
  def apple_book_category_feed(category_id, type)
    raise NotImplementedError unless %w[paid free].include? type.to_s

    "https://itunes.apple.com/us/rss/top#{type}ebooks/limit=200/genre=#{category_id}/xml"
  end

  def bn_book_page(ean)
    "http://www.barnesandnoble.com/w/a/?ean=#{ean}"
  end

  def bn_search_page(isbn13)
    "http://www.barnesandnoble.com/s/#{isbn13}"
  end

  # Escape the string and append a search param to get specific book
  # types in search results.  Colons are stripped from the title because
  # B&N search treats them specially.
  def bn_search_page_by_details(title, author_name, book_format)
    search_param = if book_format.include? 'Paperback'
                     '?aref=1521'
                   elsif book_format.include? 'Hardcover'
                     '?aref=1519'
                   elsif book_format.include? 'Kindle'
                     '?dref=2207'
                   else
                     '?dref=1'
                   end

    "http://www.barnesandnoble.com/s/#{CGI.escape("#{title.gsub(':','')} #{author_name}")}#{search_param}&view=list"
  end

  # Category listing on B&N; blank parts are simply omitted from the
  # query string.
  def bn_category_page(category_id, book_format, page_number)
    # Hoisted: the original called Utilities.bn_format_code twice.
    format_code = Utilities.bn_format_code(book_format)
    url_params = [("aref=#{format_code}" if format_code.present?),
                  "dref=#{category_id}",
                  # 90 results per page, 1-based start offset.
                  ("size=90&startat=#{(page_number - 1) * 90 + 1}" if page_number.present?),
                  ('view=grid' if page_number.present? && book_format.present?)]

    "http://www.barnesandnoble.com/s/?#{url_params.compact.join('&')}"
  end

  def goodreads_book_page(key)
    "http://www.goodreads.com/book/isbn/#{key}"
  end
end