Adding a scaled sort, to boost smaller communities. (#3907)

* Adding a scaled sort, to boost smaller communities. - Previously referred to as *best* . - Fixes #3622 * Fixing scheduled task update. * Converting hot_rank integers to floats. * Altering hot_rank psql function to default to zero after a week. * Setting scaled_rank to zero, where hot_rank is zero. * Adding image_upload table.
2024-11-21 14:17:08 +00:00 · 2023-09-06 13:43:27 -04:00 · 2023-09-06 13:43:27 -04:00 · 9785b20843
parent 4121fc4d56
commit 9785b20843
13 changed files with 279 additions and 31 deletions
--- a/crates/apub/src/activities/create_or_update/post.rs
+++ b/crates/apub/src/activities/create_or_update/post.rs
@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage {
    PostLike::like(&mut context.pool(), &like_form).await?;
    // Calculate initial hot_rank for post
-    PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?;
+    PostAggregates::update_ranks(&mut context.pool(), post.id).await?;
    Ok(())
  }
--- a/crates/db_schema/src/aggregates/post_aggregates.rs
+++ b/crates/db_schema/src/aggregates/post_aggregates.rs
@ -1,10 +1,14 @@
 use crate::{
  aggregates::structs::PostAggregates,
  newtypes::PostId,
-  schema::post_aggregates,
+  schema::{community_aggregates, post, post_aggregates},
-  utils::{functions::hot_rank, get_conn, DbPool},
+  utils::{
    functions::{hot_rank, scaled_rank},
    get_conn,
    DbPool,
  },
 };
-use diesel::{result::Error, ExpressionMethods, QueryDsl};
+use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl};
 use diesel_async::RunQueryDsl;
 impl PostAggregates {
@ -16,9 +20,19 @@ impl PostAggregates {
      .await
  }
-  pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
+  pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
    let conn = &mut get_conn(pool).await?;
    // Diesel can't update based on a join, which is necessary for the scaled_rank
    // https://github.com/diesel-rs/diesel/issues/1478
    // Just select the users_active_month manually for now, since it's a single post anyway
    let users_active_month = community_aggregates::table
      .select(community_aggregates::users_active_month)
      .inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
      .filter(post::id.eq(post_id))
      .first::<i64>(conn)
      .await?;
    diesel::update(post_aggregates::table)
      .filter(post_aggregates::post_id.eq(post_id))
      .set((
@ -27,6 +41,11 @@ impl PostAggregates {
          post_aggregates::score,
          post_aggregates::newest_comment_time_necro,
        )),
        post_aggregates::scaled_rank.eq(scaled_rank(
          post_aggregates::score,
          post_aggregates::published,
          users_active_month,
        )),
      ))
      .get_result::<Self>(conn)
      .await
--- a/crates/db_schema/src/aggregates/structs.rs
+++ b/crates/db_schema/src/aggregates/structs.rs
@ -27,11 +27,11 @@ pub struct CommentAggregates {
  pub published: DateTime<Utc>,
  /// The total number of children in this comment branch.
  pub child_count: i32,
-  pub hot_rank: i32,
+  pub hot_rank: f64,
  pub controversy_rank: f64,
 }
-#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
+#[derive(PartialEq, Debug, Serialize, Deserialize, Clone)]
 #[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))]
 #[cfg_attr(feature = "full", diesel(table_name = community_aggregates))]
 #[cfg_attr(
@ -55,7 +55,7 @@ pub struct CommunityAggregates {
  pub users_active_month: i64,
  /// The number of users with any activity in the last half year.
  pub users_active_half_year: i64,
-  pub hot_rank: i32,
+  pub hot_rank: f64,
 }
 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
@ -95,11 +95,13 @@ pub struct PostAggregates {
  pub featured_community: bool,
  /// If the post is featured on the site / to local.
  pub featured_local: bool,
-  pub hot_rank: i32,
+  pub hot_rank: f64,
-  pub hot_rank_active: i32,
+  pub hot_rank_active: f64,
  pub community_id: CommunityId,
  pub creator_id: PersonId,
  pub controversy_rank: f64,
  /// A rank that amplifies smaller communities
  pub scaled_rank: f64,
 }
 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
--- a/crates/db_schema/src/lib.rs
+++ b/crates/db_schema/src/lib.rs
@ -54,6 +54,7 @@ use ts_rs::TS;
 )]
 #[cfg_attr(feature = "full", DbValueStyle = "verbatim")]
 #[cfg_attr(feature = "full", ts(export))]
 // TODO add the controversial and scaled rankings to the doc below
 /// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html
 pub enum SortType {
  #[default]
@ -75,6 +76,7 @@ pub enum SortType {
  TopSixMonths,
  TopNineMonths,
  Controversial,
  Scaled,
 }
 #[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)]
--- a/crates/db_schema/src/schema.rs
+++ b/crates/db_schema/src/schema.rs
@ -100,7 +100,7 @@ diesel::table! {
        downvotes -> Int8,
        published -> Timestamptz,
        child_count -> Int4,
-        hot_rank -> Int4,
+        hot_rank -> Float8,
        controversy_rank -> Float8,
    }
 }
@ -198,7 +198,7 @@ diesel::table! {
        users_active_week -> Int8,
        users_active_month -> Int8,
        users_active_half_year -> Int8,
-        hot_rank -> Int4,
+        hot_rank -> Float8,
    }
 }
@ -299,6 +299,16 @@ diesel::table! {
    }
 }
 diesel::table! {
    image_upload (id) {
        id -> Int4,
        local_user_id -> Int4,
        pictrs_alias -> Text,
        pictrs_delete_token -> Text,
        published -> Timestamptz,
    }
 }
 diesel::table! {
    instance (id) {
        id -> Int4,
@ -683,11 +693,12 @@ diesel::table! {
        newest_comment_time -> Timestamptz,
        featured_community -> Bool,
        featured_local -> Bool,
-        hot_rank -> Int4,
+        hot_rank -> Float8,
-        hot_rank_active -> Int4,
+        hot_rank_active -> Float8,
        community_id -> Int4,
        creator_id -> Int4,
        controversy_rank -> Float8,
        scaled_rank -> Float8,
    }
 }
@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id));
 diesel::joinable!(email_verification -> local_user (local_user_id));
 diesel::joinable!(federation_allowlist -> instance (instance_id));
 diesel::joinable!(federation_blocklist -> instance (instance_id));
 diesel::joinable!(image_upload -> local_user (local_user_id));
 diesel::joinable!(local_site -> site (site_id));
 diesel::joinable!(local_site_rate_limit -> local_site (local_site_id));
 diesel::joinable!(local_user -> person (person_id));
@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!(
    email_verification,
    federation_allowlist,
    federation_blocklist,
    image_upload,
    instance,
    language,
    local_site,
--- a/crates/db_schema/src/utils.rs
+++ b/crates/db_schema/src/utils.rs
@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime<Utc> {
 pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType {
  match sort {
-    SortType::Active | SortType::Hot => CommentSortType::Hot,
+    SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot,
    SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New,
    SortType::Old => CommentSortType::Old,
    SortType::Controversial => CommentSortType::Controversial,
@ -384,7 +384,11 @@ pub mod functions {
  use diesel::sql_types::{BigInt, Text, Timestamptz};
  sql_function! {
-    fn hot_rank(score: BigInt, time: Timestamptz) -> Integer;
+    fn hot_rank(score: BigInt, time: Timestamptz) -> Double;
  }
  sql_function! {
    fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
  }
  sql_function! {
--- a/crates/db_views/src/comment_report_view.rs
+++ b/crates/db_views/src/comment_report_view.rs
@ -432,7 +432,7 @@ mod tests {
        downvotes: 0,
        published: agg.published,
        child_count: 0,
-        hot_rank: 1728,
+        hot_rank: 0.1728,
        controversy_rank: 0.0,
      },
      my_vote: None,
--- a/crates/db_views/src/comment_view.rs
+++ b/crates/db_views/src/comment_view.rs
@ -886,7 +886,7 @@ mod tests {
        downvotes: 0,
        published: agg.published,
        child_count: 5,
-        hot_rank: 1728,
+        hot_rank: 0.1728,
        controversy_rank: 0.0,
      },
    }
--- a/crates/db_views/src/post_view.rs
+++ b/crates/db_views/src/post_view.rs
@ -380,6 +380,9 @@ fn queries<'a>() -> Queries<
      SortType::Hot => query
        .then_order_by(post_aggregates::hot_rank.desc())
        .then_order_by(post_aggregates::published.desc()),
      SortType::Scaled => query
        .then_order_by(post_aggregates::scaled_rank.desc())
        .then_order_by(post_aggregates::published.desc()),
      SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()),
      SortType::New => query.then_order_by(post_aggregates::published.desc()),
      SortType::Old => query.then_order_by(post_aggregates::published.asc()),
@ -1154,9 +1157,10 @@ mod tests {
        newest_comment_time: inserted_post.published,
        featured_community: false,
        featured_local: false,
-        hot_rank: 1728,
+        hot_rank: 0.1728,
-        hot_rank_active: 1728,
+        hot_rank_active: 0.1728,
        controversy_rank: 0.0,
        scaled_rank: 0.3621,
        community_id: inserted_post.community_id,
        creator_id: inserted_post.creator_id,
      },
--- a/crates/db_views_actor/src/community_view.rs
+++ b/crates/db_views_actor/src/community_view.rs
@ -105,7 +105,7 @@ fn queries<'a>() -> Queries<
    }
    match options.sort.unwrap_or(Hot) {
-      Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()),
+      Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()),
      NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => {
        query = query.order_by(community_aggregates::users_active_day.desc())
      }
--- a/migrations/2023-08-23-182533_scaled_rank/down.sql
+++ b/migrations/2023-08-23-182533_scaled_rank/down.sql
@ -0,0 +1,87 @@
 DROP FUNCTION scaled_rank;
 ALTER TABLE community_aggregates
    ALTER COLUMN hot_rank TYPE integer,
    ALTER COLUMN hot_rank SET DEFAULT 1728;
 ALTER TABLE comment_aggregates
    ALTER COLUMN hot_rank TYPE integer,
    ALTER COLUMN hot_rank SET DEFAULT 1728;
 ALTER TABLE post_aggregates
    ALTER COLUMN hot_rank TYPE integer,
    ALTER COLUMN hot_rank SET DEFAULT 1728,
    ALTER COLUMN hot_rank_active TYPE integer,
    ALTER COLUMN hot_rank_active SET DEFAULT 1728;
 -- Change back to integer version
 DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
 -- hot_rank(score, published), integer version restored by this down migration.
 -- Returns floor(10000 * log(greatest(1, score + 3)) / (hours_since_published + 2)^1.8)
 -- cast to integer when `published` is in the past, and 0 otherwise.
 -- Unlike the float version it replaces, there is no one-week cutoff here.
 CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
    RETURNS integer
    AS $$
 DECLARE
    hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
 BEGIN
    IF (hours_diff > 0) THEN
        RETURN floor(10000 * log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8))::integer;
    ELSE
        -- if the post is from the future, set hot score to 0. otherwise you can game the post to
        -- always be on top even with only 1 vote by setting it to the future
        RETURN 0;
    END IF;
 END;
 $$
 LANGUAGE plpgsql
 IMMUTABLE PARALLEL SAFE;
 ALTER TABLE post_aggregates
    DROP COLUMN scaled_rank;
 -- The following code is necessary because postgres can't remove
 -- a single enum value.
 ALTER TABLE local_user
    ALTER default_sort_type DROP DEFAULT;
 UPDATE
    local_user
 SET
    default_sort_type = 'Hot'
 WHERE
    default_sort_type = 'Scaled';
 -- rename the old enum
 ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;
 -- create the new enum
 CREATE TYPE sort_type_enum AS ENUM (
    'Active',
    'Hot',
    'New',
    'Old',
    'TopDay',
    'TopWeek',
    'TopMonth',
    'TopYear',
    'TopAll',
    'MostComments',
    'NewComments',
    'TopHour',
    'TopSixHour',
    'TopTwelveHour',
    'TopThreeMonths',
    'TopSixMonths',
    'TopNineMonths'
 );
 -- alter all your enum columns
 ALTER TABLE local_user
    ALTER COLUMN default_sort_type TYPE sort_type_enum
    USING default_sort_type::text::sort_type_enum;
 ALTER TABLE local_user
    ALTER default_sort_type SET DEFAULT 'Active';
 -- drop the old enum
 DROP TYPE sort_type_enum__;
--- a/migrations/2023-08-23-182533_scaled_rank/up.sql
+++ b/migrations/2023-08-23-182533_scaled_rank/up.sql
@ -0,0 +1,74 @@
 -- Change hot ranks and functions from an int to a float
 ALTER TABLE community_aggregates
    ALTER COLUMN hot_rank TYPE float,
    ALTER COLUMN hot_rank SET DEFAULT 0.1728;
 ALTER TABLE comment_aggregates
    ALTER COLUMN hot_rank TYPE float,
    ALTER COLUMN hot_rank SET DEFAULT 0.1728;
 ALTER TABLE post_aggregates
    ALTER COLUMN hot_rank TYPE float,
    ALTER COLUMN hot_rank SET DEFAULT 0.1728,
    ALTER COLUMN hot_rank_active TYPE float,
    ALTER COLUMN hot_rank_active SET DEFAULT 0.1728;
 DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
 -- hot_rank(score, published), float version.
 -- Returns log(greatest(1, score + 3)) / (hours_since_published + 2)^1.8 while the item is
 -- strictly between 0 and 168 hours old. Returns 0.0 in every other case, i.e. both when
 -- `published` is now or in the future AND when the item is a week old or older, so the
 -- ELSE branch below is not only the "post from the future" case.
 -- NOTE(review): the body reads now() while the function is declared IMMUTABLE; PostgreSQL
 -- expects IMMUTABLE results to depend on the arguments only -- confirm this is intended.
 CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
    RETURNS float
    AS $$
 DECLARE
    hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
 BEGIN
    -- 24 * 7 = 168, so after a week, it will default to 0.
    IF (hours_diff > 0 AND hours_diff < 168) THEN
        RETURN log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8);
    ELSE
        -- if the post is from the future, set hot score to 0. otherwise you can game the post to
        -- always be on top even with only 1 vote by setting it to the future
        RETURN 0.0;
    END IF;
 END;
 $$
 LANGUAGE plpgsql
 IMMUTABLE PARALLEL SAFE;
 -- The new scaled rank function
 -- scaled_rank(score, published, users_active_month) divides hot_rank(score, published) by
 -- log(2 + users_active_month), so for the same hot_rank a community with fewer monthly
 -- active users gets a larger value. It is 0 whenever hot_rank is 0.
 -- NOTE(review): declared IMMUTABLE but calls hot_rank, which reads now() -- confirm intent.
 CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
    RETURNS float
    AS $$
 BEGIN
    -- Add 2 to avoid divide by zero errors
    -- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621
    -- There may need to be a scale factor multiplied to users_active_month, to make
    -- the log curve less pronounced. This can be tuned in the future.
    RETURN (hot_rank (score, published) / log(2 + users_active_month));
 END;
 $$
 LANGUAGE plpgsql
 IMMUTABLE PARALLEL SAFE;
 ALTER TABLE post_aggregates
    ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621;
 UPDATE
    post_aggregates
 SET
    scaled_rank = 0
 WHERE
    hot_rank = 0
    OR hot_rank_active = 0;
 CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);
 CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);
 -- We forgot to add the controversial sort type
 ALTER TYPE sort_type_enum
    ADD VALUE 'Controversial';
 -- Add the Scaled enum
 ALTER TYPE sort_type_enum
    ADD VALUE 'Scaled';
--- a/src/scheduled_tasks.rs
+++ b/src/scheduled_tasks.rs
@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
 fn update_hot_ranks(conn: &mut PgConnection) {
  info!("Updating hot ranks for all history...");
-  process_hot_ranks_in_batches(
+  process_post_aggregates_ranks_in_batches(conn);
    conn,
    "post_aggregates",
    "a.hot_rank != 0 OR a.hot_rank_active != 0",
    "SET hot_rank = hot_rank(a.score, a.published),
         hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
  );
-  process_hot_ranks_in_batches(
+  process_ranks_in_batches(
    conn,
    "comment_aggregates",
    "a.hot_rank != 0",
    "SET hot_rank = hot_rank(a.score, a.published)",
  );
-  process_hot_ranks_in_batches(
+  process_ranks_in_batches(
    conn,
    "community_aggregates",
    "a.hot_rank != 0",
@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
 /// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
 /// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
 /// run)
-fn process_hot_ranks_in_batches(
+fn process_ranks_in_batches(
  conn: &mut PgConnection,
  table_name: &str,
  where_clause: &str,
@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches(
  );
 }
 /// Recalculates `hot_rank`, `hot_rank_active` and `scaled_rank` for `post_aggregates` rows.
 ///
 /// Post aggregates is a special case, since it needs to join to the community_aggregates
 /// table, to get the active monthly user counts.
 ///
 /// Rows are walked in batches ordered by `published`, starting from the unix epoch. Only rows
 /// whose `hot_rank` or `hot_rank_active` is non-zero are selected, and rows that are currently
 /// locked are skipped (`FOR UPDATE SKIP LOCKED`). The loop ends when a batch updates no rows.
 /// A query error is logged and stops the run; nothing is returned to the caller.
 fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
  let process_start_time: DateTime<Utc> = Utc
    .timestamp_opt(0, 0)
    .single()
    .expect("0 timestamp creation");

  let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
  let mut processed_rows_count = 0;
  // Cursor: the greatest `published` value handled so far. `None` ends the loop.
  let mut previous_batch_result = Some(process_start_time);
  while let Some(previous_batch_last_published) = previous_batch_result {
    let result = sql_query(
      r#"WITH batch AS (SELECT pa.id
               FROM post_aggregates pa
               WHERE pa.published > $1
               AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0)
               ORDER BY pa.published
               LIMIT $2
               FOR UPDATE SKIP LOCKED)
         UPDATE post_aggregates pa
           SET hot_rank = hot_rank(pa.score, pa.published),
           hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
           scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
         FROM batch, community_aggregates ca
         WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
    "#,
    )
    .bind::<Timestamptz, _>(previous_batch_last_published)
    .bind::<Integer, _>(update_batch_size)
    .get_results::<HotRanksUpdateResult>(conn);

    match result {
      Ok(updated_rows) => {
        processed_rows_count += updated_rows.len();
        // `UPDATE ... RETURNING` does not guarantee any row order, so advance the cursor to the
        // greatest `published` of the batch instead of whichever row happened to come back last.
        // An empty batch yields `None`, which terminates the loop.
        previous_batch_result = updated_rows.iter().map(|row| row.published).max();
      }
      Err(e) => {
        error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
        break;
      }
    }
  }
  info!(
    "Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
    "post_aggregates", processed_rows_count
  );
 }
 fn delete_expired_captcha_answers(conn: &mut PgConnection) {
  diesel::delete(
    captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),