From ff2bbf41ca4affe3ea0d57a30e03efa734a8b0ab Mon Sep 17 00:00:00 2001
From: flamingos-cant <45780476+flamingo-cant-draw@users.noreply.github.com>
Date: Wed, 4 Dec 2024 13:29:50 +0000
Subject: [PATCH] Use magic number to determine file type (#5225)

* Revert "Guess image mime type from file extension (fixes #5196) (#5212)"

This reverts commit 63ea99d38ade254bd428b5d031be0feec123315f.

* Use magic numbers to determine file type.

* fmt

* Don't wrap response in an option

* Regen Cargo.lock

* Clean-up + guess mime type from extension if server is unresponsive

* Move some things about.

* Some cleanup.

* Removing comment lines.

---------

Co-authored-by: Dessalines <tyhou13@gmx.com>
---
 Cargo.lock                       | 23 +++++++++++-
 crates/api_common/Cargo.toml     |  1 +
 crates/api_common/src/request.rs | 60 ++++++++++++++++++--------------
 3 files changed, 57 insertions(+), 27 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b000f9741..bf0dac6e8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -779,6 +779,17 @@ dependencies = [
  "nom",
 ]
 
+[[package]]
+name = "cfb"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
+dependencies = [
+ "byteorder",
+ "fnv",
+ "uuid",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -2347,6 +2358,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "infer"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847"
+dependencies = [
+ "cfb",
+]
+
 [[package]]
 name = "inout"
 version = "0.1.3"
@@ -2507,6 +2527,7 @@ dependencies = [
  "encoding_rs",
  "enum-map",
  "futures",
+ "infer",
  "jsonwebtoken",
  "lemmy_db_schema",
  "lemmy_db_views",
@@ -2878,7 +2899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 dependencies = [
  "cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
diff --git a/crates/api_common/Cargo.toml b/crates/api_common/Cargo.toml
index 3ae14717d..74a0390ca 100644
--- a/crates/api_common/Cargo.toml
+++ b/crates/api_common/Cargo.toml
@@ -66,6 +66,7 @@ enum-map = { workspace = true }
 urlencoding = { workspace = true }
 mime = { version = "0.3.17", optional = true }
 mime_guess = "2.0.5"
+infer = "0.16.0"
 webpage = { version = "2.0", default-features = false, features = [
   "serde",
 ], optional = true }
diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs
index cc506b896..c6f86b806 100644
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@@ -23,6 +23,7 @@ use lemmy_utils::{
   REQWEST_TIMEOUT,
   VERSION,
 };
+use mime::{Mime, TEXT_HTML};
 use reqwest::{
   header::{CONTENT_TYPE, RANGE},
   Client,
@@ -63,47 +64,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
     .await?
     .error_for_status()?;
 
-  // In some cases servers send a wrong mime type for images, which prevents thumbnail
-  // generation. To avoid this we also try to guess the mime type from file extension.
-  let content_type = mime_guess::from_path(url.path())
-    .first()
-    // If you can guess that its an image type, then return that first.
-    .filter(|guess| guess.type_() == mime::IMAGE)
-    // Otherwise, get the content type from the headers
-    .or(
-      response
-        .headers()
-        .get(CONTENT_TYPE)
-        .and_then(|h| h.to_str().ok())
-        .and_then(|h| h.parse().ok()),
-    );
+  let mut content_type: Option<Mime> = response
+    .headers()
+    .get(CONTENT_TYPE)
+    .and_then(|h| h.to_str().ok())
+    .and_then(|h| h.parse().ok())
+    // If we don't get a content_type from the response (e.g. if the server is down),
+    // then try to infer the content_type from the file extension.
+    .or(mime_guess::from_path(url.path()).first());
 
   let opengraph_data = {
-    // if the content type is not text/html, we don't need to parse it
     let is_html = content_type
       .as_ref()
       .map(|c| {
-        (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
-      ||
-      // application/xhtml+xml is a subset of HTML
-      (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
+        // application/xhtml+xml is a subset of HTML
+        let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
+        let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
+        allowed_mime_types.contains(&c.essence_str())
       })
-      .unwrap_or(false);
-    if !is_html {
-      Default::default()
-    } else {
+      .unwrap_or_default();
+
+    if is_html {
       // Can't use .text() here, because it only checks the content header, not the actual bytes
       // https://github.com/LemmyNet/lemmy/issues/1964
-      // So we want to do deep inspection of the actually returned bytes but need to be careful not
-      // spend too much time parsing binary data as HTML
-
+      // So we want to do deep inspection of the actually returned bytes but need to be careful
+      // not to spend too much time parsing binary data as HTML
       // only take first bytes regardless of how many bytes the server returns
       let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
       extract_opengraph_data(&html_bytes, url)
         .map_err(|e| info!("{e}"))
         .unwrap_or_default()
+    } else {
+      let is_octet_type = content_type
+        .as_ref()
+        .map(|c| c.subtype() == "octet-stream")
+        .unwrap_or_default();
+
+      // Overwrite the content type if it's an octet type
+      if is_octet_type {
+        // Don't need to fetch as much data for this as we do with opengraph
+        let octet_bytes = collect_bytes_until_limit(response, 512).await?;
+        content_type =
+          infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
+      }
+
+      Default::default()
     }
   };
+
   Ok(LinkMetadata {
     opengraph_data,
     content_type: content_type.map(|c| c.to_string()),