catstodon/lib/mastodon/snowflake.rb
ThibG a24605961a Fixes/do not override timestamps (#7336)
* Revert "Fixes/do not override timestamps (#7331)"

This reverts commit 581a5c9d29.

* Document Snowflake ID corner-case a bit more

Snowflake IDs are used for two purposes: making object identifiers harder to
guess and ensuring they are in chronological order. For this reason, they
are based on the `created_at` attribute of the object.

Unfortunately, inserting items with older snowflakes IDs will break the
assumption of consumers of the paging APIs that new items will always have
a greater identifier than the last seen one.

* Add `override_timestamps` virtual attribute to not correlate snowflake ID with created_at
2018-05-03 23:02:46 +02:00

162 lines
5.1 KiB
Ruby

# frozen_string_literal: true
module Mastodon::Snowflake
DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
class Callbacks
def self.around_create(record)
now = Time.now.utc
if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at || record.override_timestamps
yield
else
record.id = Mastodon::Snowflake.id_at(record.created_at)
tries = 0
begin
yield
rescue ActiveRecord::RecordNotUnique
raise if tries > 100
tries += 1
record.id += rand(100)
retry
end
end
end
end
class << self
# Our ID will be composed of the following:
# 6 bytes (48 bits) of millisecond-level timestamp
# 2 bytes (16 bits) of sequence data
#
# The 'sequence data' is intended to be unique within a
# given millisecond, yet obscure the 'serial number' of
# this row.
#
# To do this, we hash the following data:
# * Table name (if provided, skipped if not)
# * Secret salt (should not be guessable)
# * Timestamp (again, millisecond-level granularity)
#
# We then take the first two bytes of that value, and add
# the lowest two bytes of the table ID sequence number
# (`table_name`_id_seq). This means that even if we insert
# two rows at the same millisecond, they will have
# distinct 'sequence data' portions.
#
# If this happens, and an attacker can see both such IDs,
# they can determine which of the two entries was inserted
# first, but not the total number of entries in the table
# (even mod 2**16).
#
# The table name is included in the hash to ensure that
# different tables derive separate sequence bases so rows
# inserted in the same millisecond in different tables do
# not reveal the table ID sequence number for one another.
#
# The secret salt is included in the hash to ensure that
# external users cannot derive the sequence base given the
# timestamp and table name, which would allow them to
# compute the table ID sequence number.
def define_timestamp_id
return if already_defined?
connection.execute(<<~SQL)
CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
RETURNS bigint AS
$$
DECLARE
time_part bigint;
sequence_base bigint;
tail bigint;
BEGIN
time_part := (
-- Get the time in milliseconds
((date_part('epoch', now()) * 1000))::bigint
-- And shift it over two bytes
<< 16);
sequence_base := (
'x' ||
-- Take the first two bytes (four hex characters)
substr(
-- Of the MD5 hash of the data we documented
md5(table_name ||
'#{SecureRandom.hex(16)}' ||
time_part::text
),
1, 4
)
-- And turn it into a bigint
)::bit(16)::bigint;
-- Finally, add our sequence number to our base, and chop
-- it to the last two bytes
tail := (
(sequence_base + nextval(table_name || '_id_seq'))
& 65535);
-- Return the time part and the sequence part. OR appears
-- faster here than addition, but they're equivalent:
-- time_part has no trailing two bytes, and tail is only
-- the last two bytes.
RETURN time_part | tail;
END
$$ LANGUAGE plpgsql VOLATILE;
SQL
end
def ensure_id_sequences_exist
# Find tables using timestamp IDs.
connection.tables.each do |table|
# We're only concerned with "id" columns.
next unless (id_col = connection.columns(table).find { |col| col.name == 'id' })
# And only those that are using timestamp_id.
next unless (data = DEFAULT_REGEX.match(id_col.default_function))
seq_name = data[:seq_prefix] + '_id_seq'
# If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
# NOT EXISTS, but we can't depend on that. Instead, catch the
# possible exception and ignore it.
# Note that seq_name isn't a column name, but it's a
# relation, like a column, and follows the same quoting rules
# in Postgres.
connection.execute(<<~SQL)
DO $$
BEGIN
CREATE SEQUENCE #{connection.quote_column_name(seq_name)};
EXCEPTION WHEN duplicate_table THEN
-- Do nothing, we have the sequence already.
END
$$ LANGUAGE plpgsql;
SQL
end
end
def id_at(timestamp)
id = timestamp.to_i * 1000 + rand(1000)
id = id << 16
id += rand(2**16)
id
end
private
def already_defined?
connection.execute(<<~SQL).values.first.first
SELECT EXISTS(
SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
);
SQL
end
def connection
ActiveRecord::Base.connection
end
end
end