Change tootctl search deploy algorithm (#14300)

This commit is contained in:
Eugen Rochko 2020-07-14 18:10:35 +02:00 committed by GitHub
parent 98b3b80d6b
commit 4abe3be321
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 132 additions and 21 deletions

View file

@ -31,7 +31,7 @@ class StatusesIndex < Chewy::Index
},
}
define_type ::Status.unscoped.kept.without_reblogs.includes(:media_attachments), delete_if: ->(status) { status.searchable_by.empty? } do
define_type ::Status.unscoped.kept.without_reblogs.includes(:media_attachments, :preloadable_poll) do
crutch :mentions do |collection|
data = ::Mention.where(status_id: collection.map(&:id)).where(account: Account.local, silent: false).pluck(:status_id, :account_id)
data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) }

View file

@ -7,6 +7,7 @@ ActiveRecord::Base.logger = dev_null
ActiveJob::Base.logger = dev_null
HttpLog.configuration.logger = dev_null
Paperclip.options[:log] = false
Chewy.logger = dev_null
module Mastodon
module CLIHelper

View file

@ -6,8 +6,19 @@ require_relative 'cli_helper'
module Mastodon
class SearchCLI < Thor
option :processes, default: 2, aliases: [:p]
desc 'deploy', 'Create or update an ElasticSearch index and populate it'
include CLIHelper
# Indices are sorted by amount of data to be expected in each, so that
# smaller indices can go online sooner
INDICES = [
AccountsIndex,
TagsIndex,
StatusesIndex,
].freeze
option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
long_desc <<~LONG_DESC
If ElasticSearch is empty, this command will create the necessary indices
and then import data from the database into those indices.
@ -15,27 +26,126 @@ module Mastodon
This command will also upgrade indices if the underlying schema has been
changed since the last run.
With the --processes option, parallelize execution of the command. The
default is 2. If "auto" is specified, the number is automatically
derived from available CPUs.
Even if creating or upgrading indices is not necessary, data from the
database will be imported into the indices.
LONG_DESC
def deploy
processed = Chewy::RakeHelper.upgrade(parallel: processes)
Chewy::RakeHelper.sync(except: processed, parallel: processes)
end
private
def processes
return true if options[:processes] == 'auto'
num = options[:processes].to_i
if num < 2
nil
else
num
if options[:concurrency] < 1
say('Cannot run with this concurrency setting, must be at least 1', :red)
exit(1)
end
indices = begin
if options[:only]
options[:only].map { |str| "#{str.camelize}Index".constantize }
else
INDICES
end
end
progress = ProgressBar.create(total: nil, format: '%t%c/%u |%b%i| %e (%r docs/s)', autofinish: false)
# First, ensure all indices are created and have the correct
# structure, so that live data can already be written
indices.select { |index| index.specification.changed? }.each do |index|
progress.title = "Upgrading #{index} "
index.purge
index.specification.lock!
end
ActiveRecord::Base.configurations[Rails.env]['pool'] = options[:concurrency] + 1
pool = Concurrent::FixedThreadPool.new(options[:concurrency])
added = Concurrent::AtomicFixnum.new(0)
removed = Concurrent::AtomicFixnum.new(0)
progress.title = 'Estimating workload '
# Estimate the amount of data that has to be imported first
indices.each do |index|
index.types.each do |type|
progress.total = (progress.total || 0) + type.adapter.default_scope.count
end
end
# Now import all the actual data. Mind that unlike chewy:sync, we don't
# fetch and compare all record IDs from the database and the index to
# find out which to add and which to remove from the index. Because with
# potentially millions of rows, the memory footprint of such a calculation
# is uneconomical. So we only ever add.
indices.each do |index|
progress.title = "Importing #{index} "
batch_size = 1_000
slice_size = (batch_size / options[:concurrency]).ceil
index.types.each do |type|
type.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
futures = []
batch.each_slice(slice_size) do |records|
futures << Concurrent::Future.execute(executor: pool) do
begin
if !progress.total.nil? && progress.progress + records.size > progress.total
# The number of items has changed between start and now,
# since there is no good way to predict the final count from
# here, just change the progress bar to an indeterminate one
progress.total = nil
end
grouped_records = nil
bulk_body = nil
index_count = 0
delete_count = 0
ActiveRecord::Base.connection_pool.with_connection do
grouped_records = type.adapter.send(:grouped_objects, records)
bulk_body = Chewy::Type::Import::BulkBuilder.new(type, grouped_records).bulk_body
end
index_count = grouped_records[:index].size if grouped_records.key?(:index)
delete_count = grouped_records[:delete].size if grouped_records.key?(:delete)
# The following is an optimization for statuses specifically, since
# we want to de-index statuses that cannot be searched by anybody,
# but can't use Chewy's delete_if logic because it doesn't use
# crutches and our searchable_by logic depends on them
if type == StatusesIndex::Status
bulk_body.map! do |entry|
if entry[:index] && entry.dig(:index, :data, 'searchable_by').blank?
index_count -= 1
delete_count += 1
{ delete: entry[:index].except(:data) }
else
entry
end
end
end
Chewy::Type::Import::BulkRequest.new(type).perform(bulk_body)
progress.progress += records.size
added.increment(index_count)
removed.increment(delete_count)
sleep 1
rescue => e
progress.log pastel.red("Error importing #{index}: #{e}")
end
end
end
futures.map(&:value)
end
end
end
progress.title = ''
progress.stop
say("Indexed #{added.value} records, de-indexed #{removed.value}", :green, true)
end
end
end