fix: Run extract_opengraph_data only on first 64kB of data and if Content-Type html (#4957)
* fix: Run extract_opengraph_data only on first 64kB of data and if data is not binary.
* use mime type for determination
* chore: simplify collect function
This commit is contained in:
parent
88fbcea246
commit
606545ccaf
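In short, the fetch now sends a Range request for only the first 64 kB of the body and hands the result to the OpenGraph parser only when the Content-Type parses as HTML. Below is a minimal standalone sketch of those two guards, assuming the reqwest, mime, and tokio crates; the URL and the program structure are illustrative, not Lemmy's actual wrapper code.

use mime::Mime;
use reqwest::header::{CONTENT_TYPE, RANGE};

const BYTES_TO_FETCH: usize = 64 * 1024;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
  // Hypothetical URL, standing in for whatever link a post submits.
  let url = "https://example.com/some/page";

  let response = reqwest::Client::new()
    .get(url)
    // Best-effort cap: a server that ignores Range still sends the full body,
    // which is why the commit also truncates while reading the stream.
    .header(RANGE, format!("bytes=0-{}", BYTES_TO_FETCH - 1))
    .send()
    .await?;

  // Parse Content-Type into a Mime so parameters like "; charset=utf-8"
  // don't defeat a naive string comparison.
  let content_type: Option<Mime> = response
    .headers()
    .get(CONTENT_TYPE)
    .and_then(|h| h.to_str().ok())
    .and_then(|h| h.parse().ok());

  let is_html = content_type
    .map(|c| {
      (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
        || (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
    })
    .unwrap_or(false);

  println!("worth parsing for OpenGraph tags: {is_html}");
  Ok(())
}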
@@ -8,6 +8,7 @@ use crate::{
 use activitypub_federation::config::Data;
 use chrono::{DateTime, Utc};
 use encoding_rs::{Encoding, UTF_8};
+use futures::StreamExt;
 use lemmy_db_schema::{
   newtypes::DbUrl,
   source::{
@@ -23,7 +24,12 @@ use lemmy_utils::{
   VERSION,
 };
 use mime::Mime;
-use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder};
+use reqwest::{
+  header::{CONTENT_TYPE, RANGE},
+  Client,
+  ClientBuilder,
+  Response,
+};
 use reqwest_middleware::ClientWithMiddleware;
 use serde::{Deserialize, Serialize};
 use tracing::info;
@@ -44,7 +50,17 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder {
 #[tracing::instrument(skip_all)]
 pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResult<LinkMetadata> {
   info!("Fetching site metadata for url: {}", url);
-  let response = context.client().get(url.as_str()).send().await?;
+  // We only fetch the first 64kB of data in order to not waste bandwidth, especially for large
+  // binary files.
+  let bytes_to_fetch = 64 * 1024;
+  let response = context
+    .client()
+    .get(url.as_str())
+    // We only need the first chunk of data. Note that we do not check for Accept-Ranges, so the
+    // server may ignore this and still respond with the full response.
+    .header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
+    .send()
+    .await?;
 
   let content_type: Option<Mime> = response
     .headers()
@@ -52,19 +68,57 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResult<LinkMetadata> {
     .and_then(|h| h.to_str().ok())
     .and_then(|h| h.parse().ok());
 
-  // Can't use .text() here, because it only checks the content header, not the actual bytes
-  // https://github.com/LemmyNet/lemmy/issues/1964
-  let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
-
-  let opengraph_data = extract_opengraph_data(&html_bytes, url)
-    .map_err(|e| info!("{e}"))
-    .unwrap_or_default();
+  let opengraph_data = {
+    // if the content type is not text/html, we don't need to parse it
+    let is_html = content_type
+      .as_ref()
+      .map(|c| {
+        (c.type_() == mime::TEXT && c.subtype() == mime::HTML)
+          ||
+          // application/xhtml+xml is a subset of HTML
+          (c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
+      })
+      .unwrap_or(false);
+    if !is_html {
+      Default::default()
+    } else {
+      // Can't use .text() here, because it only checks the content header, not the actual bytes
+      // https://github.com/LemmyNet/lemmy/issues/1964
+      // So we want to do deep inspection of the actually returned bytes, but need to be careful
+      // not to spend too much time parsing binary data as HTML.
+
+      // only take the first bytes, regardless of how many bytes the server returns
+      let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
+      extract_opengraph_data(&html_bytes, url)
+        .map_err(|e| info!("{e}"))
+        .unwrap_or_default()
+    }
+  };
   Ok(LinkMetadata {
     opengraph_data,
     content_type: content_type.map(|c| c.to_string()),
   })
 }
+
+async fn collect_bytes_until_limit(
+  response: Response,
+  requested_bytes: usize,
+) -> Result<Vec<u8>, LemmyError> {
+  let mut stream = response.bytes_stream();
+  let mut bytes = Vec::with_capacity(requested_bytes);
+  while let Some(chunk) = stream.next().await {
+    let chunk = chunk.map_err(LemmyError::from)?;
+    // We may go over the requested size here, but the important part is that we don't keep
+    // aggregating more chunks than needed.
+    bytes.extend_from_slice(&chunk);
+    if bytes.len() >= requested_bytes {
+      bytes.truncate(requested_bytes);
+      break;
+    }
+  }
+  Ok(bytes)
+}
 
 /// Generates and saves a post thumbnail and metadata.
 ///
 /// Takes a callback to generate a send activity task, so that post can be federated with metadata.
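The truncation loop in the new collect_bytes_until_limit helper is easy to exercise in isolation. Below is a small sketch assuming the futures and tokio crates, with in-memory Vec<u8> chunks standing in for response.bytes_stream(); the collect_until_limit function is a hypothetical stand-in that mirrors the helper's loop without the reqwest error handling.

use futures::{stream, Stream, StreamExt};

// Mirrors the helper: stop pulling chunks once the limit is reached,
// then trim any overshoot from the final chunk.
async fn collect_until_limit(
  mut stream: impl Stream<Item = Vec<u8>> + Unpin,
  requested_bytes: usize,
) -> Vec<u8> {
  let mut bytes = Vec::with_capacity(requested_bytes);
  while let Some(chunk) = stream.next().await {
    bytes.extend_from_slice(&chunk);
    if bytes.len() >= requested_bytes {
      bytes.truncate(requested_bytes);
      break;
    }
  }
  bytes
}

#[tokio::main]
async fn main() {
  // Two 40 kB chunks exceed the 64 kB cap, so the result is truncated
  // even if the server ignored the Range header.
  let chunks = stream::iter(vec![vec![0u8; 40 * 1024], vec![1u8; 40 * 1024]]);
  let bytes = collect_until_limit(chunks, 64 * 1024).await;
  assert_eq!(bytes.len(), 64 * 1024);
}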