Make diff check work just like before

2024-12-13 06:08:20 +00:00 · 2024-11-18 17:14:55 -07:00 · 2024-11-18 17:14:55 -07:00 · 8390c0eb30
parent 2aead25bc6
commit 8390c0eb30
4 changed files with 34 additions and 270 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1361,6 +1361,19 @@ version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
 [[package]]
 name = "diffutils"
 version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8d7ce619b5c0e13f7543dc2c203a7e6fa37e0111d876339aada7ec9540a58d5"
 dependencies = [
 "chrono",
 "diff",
 "regex",
 "same-file",
 "unicode-width",
 ]
 [[package]]
 name = "digest"
 version = "0.10.7"
@ -2627,7 +2640,7 @@ dependencies = [
 "diesel-derive-newtype",
 "diesel_ltree",
 "diesel_migrations",
- "diff",
+ "diffutils",
 "futures-util",
 "i-love-jesus",
 "lemmy_utils",
--- a/crates/db_schema/Cargo.toml
+++ b/crates/db_schema/Cargo.toml
@ -86,4 +86,4 @@ tuplex = { workspace = true, optional = true }
 [dev-dependencies]
 serial_test = { workspace = true }
 pretty_assertions = { workspace = true }
-diff = "0.1.13"
+diffutils = "0.4.2"
--- a/crates/db_schema/src/schema_setup.rs
+++ b/crates/db_schema/src/schema_setup.rs
@ -60,7 +60,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema";
 struct MigrationHarnessWrapper<'a> {
  conn: &'a mut PgConnection,
  #[cfg(test)]
-  enable_diff_check: bool,
+  diff_checked_migration_name: Option<String>,
 }
 impl<'a> MigrationHarnessWrapper<'a> {
@ -86,7 +86,7 @@ impl<'a> MigrationHarness<Pg> for MigrationHarnessWrapper<'a> {
    migration: &dyn Migration<Pg>,
  ) -> diesel::migration::Result<MigrationVersion<'static>> {
    #[cfg(test)]
-    if self.enable_diff_check {
+    if self.diff_checked_migration_name == Some(migration.name().to_string()) {
      let before = diff_check::get_dump();
      self.run_migration_inner(migration)?;
@ -271,7 +271,11 @@ fn run_selected_migrations(
  let mut wrapper = MigrationHarnessWrapper {
    conn,
    #[cfg(test)]
-    enable_diff_check: options.enable_diff_check,
+    diff_checked_migration_name: options
      .enable_diff_check
      .then(|| diesel::migration::MigrationSource::<Pg>::migrations(&migrations()))
      .transpose()?
      .and_then(|migrations| Some(migrations.last()?.name().to_string())),
  };
  if options.revert {
@ -325,7 +329,8 @@ mod tests {
    // Start with consistent state by dropping everything
    conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;
-    // Run all migrations and check for mistakes in down.sql files
+    // Run all migrations, make sure the newest migration can be redone, and check the newest
    // down.sql file
    assert_eq!(
      run(o().run().enable_diff_check())?,
      ReplaceableSchemaRebuilt
@ -345,10 +350,8 @@ mod tests {
    );
    assert_eq!(run(o().run().limit(1))?, ReplaceableSchemaRebuilt);
    // Revert all migrations
    assert_eq!(run(o().revert())?, ReplaceableSchemaNotRebuilt);
    // This should throw an error saying to use lemmy_server instead of diesel CLI
    conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;
    assert!(matches!(
      conn.run_pending_migrations(migrations()),
      Err(e) if e.to_string().contains("lemmy_server")
--- a/crates/db_schema/src/schema_setup/diff_check.rs
+++ b/crates/db_schema/src/schema_setup/diff_check.rs
@ -1,10 +1,6 @@
 #![expect(clippy::expect_used)]
 use lemmy_utils::settings::SETTINGS;
-use std::{
+use std::process::{Command, Stdio};
  borrow::Cow,
  collections::btree_set::{self, BTreeSet},
  process::{Command, Stdio},
 };
 // It's not possible to call `export_snapshot()` for each dump and run the dumps in parallel with
 // the `--snapshot` flag. Don't waste your time!!!!
@ -16,21 +12,12 @@ pub fn get_dump() -> String {
      // Specify database URL
      "--dbname",
      &db_url,
-      // Allow differences in row data and old fast tables
+      // Disable some things
      "--schema-only",
      "--exclude-table=comment_aggregates_fast",
      "--exclude-table=community_aggregates_fast",
      "--exclude-table=post_aggregates_fast",
      "--exclude-table=user_fast",
      // Ignore some things to reduce the amount of queries done by pg_dump
      "--no-owner",
      "--no-privileges",
      "--no-comments",
      "--no-publications",
      "--no-security-labels",
      "--no-subscriptions",
      "--no-table-access-method",
-      "--no-tablespaces",
+      "--schema-only",
      "--no-sync",
    ])
    .stderr(Stdio::inherit())
    .output()
@ -42,251 +29,12 @@ pub fn get_dump() -> String {
  String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text")
 }
 const PATTERN_LEN: usize = 19;
 pub fn check_dump_diff(before: String, after: String, label: &str) {
-  // Performance optimization
+  if before != after {
-  if after == before {
+    let diff_bytes =
-    return;
+      diffutilslib::unified_diff(before.as_bytes(), after.as_bytes(), &Default::default());
-  }
+    let diff = String::from_utf8_lossy(&diff_bytes);
-  // Borrow checker requires this to be a separate variable
+    panic!("{label}\n\n{diff}");
  let normalized_chunk_vecs = [&before, &after]
    // Remove identical items
    .map(|dump| chunks(dump).collect::<BTreeSet<_>>())
    .differences()
    // Remove items without unwanted types of differences (if migrations are correct, then this
    // removes everything)
    .map(|chunks| chunks.map(|&i| normalize_chunk(i)).collect::<BTreeSet<_>>());
  let [only_in_before, only_in_after] = normalized_chunk_vecs // Imagine that this line doesn't exist
    .differences()
    .map(|chunks| chunks.map(|i| &**i).collect::<Vec<_>>());
  if only_in_before.is_empty() && only_in_after.is_empty() {
    return;
  }
  // Build the panic message
  let after_has_more = only_in_before.len() < only_in_after.len();
  let [chunks, mut other_chunks] = if after_has_more {
    [only_in_before, only_in_after]
  } else {
    [only_in_after, only_in_before]
  };
  let diffs = chunks
    .into_iter()
    .chain(std::iter::repeat(""))
    .map_while(|chunk| {
      let (most_similar_chunk_index, most_similar_chunk) = other_chunks
        .iter()
        .enumerate()
        .max_by_key(|(_, other_chunk)| {
          if sql_command_name(chunk) != sql_command_name(other_chunk) {
            0
          } else {
            similarity(chunk, other_chunk)
          }
        })?;
      let diff_lines = if after_has_more {
        diff::lines(most_similar_chunk, chunk)
      } else {
        diff::lines(chunk, most_similar_chunk)
      };
      other_chunks.swap_remove(most_similar_chunk_index);
      Some(
        diff_lines
          .into_iter()
          .flat_map(|line| match line {
            diff::Result::Left(s) => ["- ", s, "\n"],
            diff::Result::Right(s) => ["+ ", s, "\n"],
            diff::Result::Both(s, _) => ["  ", s, "\n"],
          })
          .chain(["\n"])
          .collect::<String>(),
      )
    });
  panic!(
    "{}",
    std::iter::once(format!("{label}\n\n"))
      .chain(diffs)
      .collect::<String>()
  );
 }
 trait Differences<T> {
  fn differences(&self) -> [btree_set::Difference<'_, T>; 2];
 }
 impl<T: Ord> Differences<T> for [BTreeSet<T>; 2] {
  /// Items only in `a`, and items only in `b`
  fn differences(&self) -> [btree_set::Difference<'_, T>; 2] {
    let [a, b] = self;
    [a.difference(b), b.difference(a)]
  }
 }
 fn sql_command_name(chunk: &str) -> &str {
  chunk
    .split_once(|c: char| c.is_lowercase())
    .unwrap_or_default()
    .0
 }
 fn similarity(chunk: &str, other_chunk: &str) -> usize {
  diff::chars(chunk, other_chunk)
    .into_iter()
    .filter(|i| {
      match i {
        diff::Result::Both(c, _) => {
          // Prevent whitespace from affecting similarity level
          !c.is_whitespace()
            && (
              // Increase accuracy for some trigger function diffs
              c.is_lowercase()
                  // Preserve differences in names that contain a number
                  || c.is_numeric()
            )
        }
        _ => false,
      }
    })
    .count()
 }
 fn normalize_chunk(chunk: &str) -> Cow<'_, str> {
  let mut chunk = Cow::Borrowed(chunk);
  let stripped_lines = chunk
    .lines()
    .map(|line| line.strip_suffix(',').unwrap_or(line));
  // Sort column names, so differences in column order are ignored
  if chunk.starts_with("CREATE TABLE ") {
    let mut lines = stripped_lines.collect::<Vec<_>>();
    sort_within_sections(&mut lines, |line| {
      match line.chars().next() {
        // CREATE
        Some('C') => 0,
        // Indented column name
        Some(' ') => 1,
        // End
        Some(')') => 2,
        _ => panic!("unrecognized part of `CREATE TABLE` statement: {line}"),
      }
    });
    chunk = Cow::Owned(lines.join("\n"));
  }
  // Replace timestamps with a constant string, so differences in timestamps are ignored
  for index in 0.. {
    // Performance optimization
    let Some(byte) = chunk.as_bytes().get(index) else {
      break;
    };
    if !byte.is_ascii_digit() {
      continue;
    }
    // Check for this pattern: 0000-00-00 00:00:00
    let Some((
      &[a0, a1, a2, a3, b0, a4, a5, b1, a6, a7, b2, a8, a9, b3, a10, a11, b4, a12, a13],
      remaining,
    )) = chunk
      .get(index..)
      .and_then(|s| s.as_bytes().split_first_chunk::<PATTERN_LEN>())
    else {
      break;
    };
    if [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13]
      .into_iter()
      .all(|byte| byte.is_ascii_digit())
      && [b0, b1, b2, b3, b4] == *b"-- ::"
    {
      // Replace the part of the string that has the checked pattern and an optional fractional part
      let len_after = if let Some((b'.', s)) = remaining.split_first() {
        1 + s.iter().position(|c| !c.is_ascii_digit()).unwrap_or(0)
      } else {
        0
      };
      // Length of replacement string is likely to match previous string
      // (there's up to 6 digits after the decimal point)
      chunk.to_mut().replace_range(
        index..(index + PATTERN_LEN + len_after),
        "AAAAAAAAAAAAAAAAAAAAAAAAAA",
      );
    }
  }
  chunk
 }
 fn sort_within_sections<T: Ord + ?Sized>(vec: &mut [&T], mut section: impl FnMut(&T) -> u8) {
  vec.sort_unstable_by_key(|&i| (section(i), i));
 }
 fn chunks(dump: &str) -> impl Iterator<Item = &str> {
  let mut remaining = dump;
  std::iter::from_fn(move || {
    remaining = remaining.trim_start();
    while let Some(s) = remove_skipped_item_from_beginning(remaining) {
      remaining = s.trim_start();
    }
    // `trim_start` guarantees that `result` is not empty
    let (result, after_result) = remaining.split_once("\n\n")?;
    remaining = after_result;
    Some(result)
  })
 }
 fn remove_skipped_item_from_beginning(s: &str) -> Option<&str> {
  // Skip commented line
  if let Some(after) = s.strip_prefix("--") {
    Some(after_first_occurence(after, "\n"))
  }
  // Skip old views and fast table triggers
  else if let Some(after) = s
    .strip_prefix("CREATE VIEW ")
    .or_else(|| s.strip_prefix("CREATE OR REPLACE VIEW "))
    .or_else(|| s.strip_prefix("CREATE MATERIALIZED VIEW "))
    .or_else(|| {
      s.strip_prefix("CREATE FUNCTION public.")
        .and_then(after_skipped_trigger_name)
        .and_then(|a| a.strip_prefix('('))
    })
    .or_else(|| {
      s.strip_prefix("CREATE TRIGGER ")
        .and_then(after_skipped_trigger_name)
        .and_then(|a| a.strip_prefix(' '))
    })
  {
    Some(after_first_occurence(after, "\n\n"))
  } else {
    None
  }
 }
 fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str {
  s.split_once(pat).unwrap_or_default().1
 }
 fn after_skipped_trigger_name(s: &str) -> Option<&str> {
  s.strip_prefix("refresh_comment_like")
    .or_else(|| s.strip_prefix("refresh_comment"))
    .or_else(|| s.strip_prefix("refresh_community_follower"))
    .or_else(|| s.strip_prefix("refresh_community_user_ban"))
    .or_else(|| s.strip_prefix("refresh_community"))
    .or_else(|| s.strip_prefix("refresh_post_like"))
    .or_else(|| s.strip_prefix("refresh_post"))
    .or_else(|| s.strip_prefix("refresh_private_message"))
    .or_else(|| s.strip_prefix("refresh_user"))
 }