/**
 * Extracts the first email message from the HTML content by removing quoted content and signatures.
 * Sanitizes HTML to prevent XSS attacks.
 *
 * @param {string} htmlContent - The raw HTML email content.
 * @returns {string} - The cleaned and sanitized HTML content.
 */
import DOMPurify from "dompurify";

/**
 * Extracts the newest message from an HTML email body.
 *
 * Steps, in order:
 *  1. Parse the markup into a detached document with DOMParser.
 *  2. Drop elements that mark quoted replies (Gmail/Yahoo/Outlook wrappers,
 *     blockquotes), plus all <img> and <style> elements.
 *  3. Strip "On ... wrote:" attribution text from text nodes.
 *  4. Remove every <br> that directly follows another <br> element.
 *  5. Remove every attribute except `href` on <a> elements.
 *  6. Sanitize the serialized result with DOMPurify using a strict allowlist.
 *
 * Requires a DOM environment providing DOMParser and NodeFilter.
 *
 * @param {string} htmlContent - The raw HTML email content.
 * @returns {string} The cleaned and sanitized HTML; "" for falsy input.
 */
export function extractEmailContent(htmlContent) {
  if (!htmlContent) return "";

  // Reply attribution such as "On Mon, Jan 1, 2024 at 10:00 AM Bob wrote:".
  const attributionPattern = /On .+wrote:/;

  const doc = new DOMParser().parseFromString(htmlContent, "text/html");

  // Selectors that indicate quoted content or signatures.
  const selectorsToRemove = [
    ".gmail_quote",
    ".gmail_attr",
    ".yahoo_quoted",
    ".OutlookMessageHeader",
    "blockquote",
    '[data-marker="__QUOTED_TEXT__"]',
    "img", // Images are usually tracking pixels or signature logos
    "style",
  ];

  selectorsToRemove.forEach((selector) => {
    doc.querySelectorAll(selector).forEach((el) => el.remove());
  });

  // Create the walker from the parsed document itself so the function does
  // not depend on an ambient global `document`.
  const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_TEXT);

  // Collect the text nodes first; mutating the tree while walking it would
  // disturb the traversal.
  const textNodes = [];
  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
    textNodes.push(node);
  }

  textNodes.forEach((node) => {
    const match = node.textContent.match(attributionPattern);
    if (!match) return;

    if (node.textContent.trim() === match[0]) {
      // The node holds nothing but the attribution line.
      node.remove();
    } else {
      // Keep the surrounding text, drop only the attribution.
      node.textContent = node.textContent.replace(attributionPattern, "");
    }
  });

  // "br + br" selects each <br> whose previous element sibling is a <br>,
  // so runs of line breaks collapse to a single one.
  doc.querySelectorAll("br + br").forEach((br) => br.remove());

  // Strip all attributes except `href` on links.
  doc.querySelectorAll("*").forEach((el) => {
    const isLink = el.tagName.toLowerCase() === "a";
    Array.from(el.attributes).forEach((attr) => {
      const keep = isLink && attr.name === "href";
      if (!keep) {
        el.removeAttribute(attr.name);
      }
    });
  });

  let cleanedContent = doc.body.innerHTML.trim();

  // Catch a trailing attribution that spans several nodes (e.g. the sender
  // address is wrapped in a link) and was therefore missed above.
  cleanedContent = cleanedContent.replace(/On .+wrote:$/, "").trim();

  // Sanitize with an explicit allowlist. USE_PROFILES is deliberately NOT
  // set: DOMPurify documents that USE_PROFILES overrides ALLOWED_TAGS, which
  // would silently discard the allowlist below.
  return DOMPurify.sanitize(cleanedContent, {
    ALLOWED_TAGS: [
      "p",
      "div",
      "span",
      "br",
      "a",
      "ul",
      "ol",
      "li",
      "b",
      "i",
      "strong",
      "em",
    ],
    ALLOWED_ATTR: ["href"],
  });
}

/**
 * Cleans an HTML email with extractEmailContent and then normalizes its
 * line-break markup.
 *
 * After cleaning, the content is post-processed as follows:
 *  - every <br> variant (<br>, <BR>, <br />) is rewritten to <br/>;
 *  - adjacent `</div><div>` pairs are separated by a newline;
 *  - whitespace between a sign-off ("Thanks,", "Best,", "Regards,",
 *    "Best regards,", matched case-insensitively) and the following <br/>
 *    is removed, as is whitespace between that <br/> and a following name.
 *
 * The sign-off text itself is emitted exactly as the author wrote it.
 *
 * @param {string} htmlContent - The original HTML email content
 * @returns {string} - Processed HTML with normalized line breaks; '' for falsy input
 */
export function processEmailContent(htmlContent) {
  if (!htmlContent) return '';

  // First clean the content using the standard extractEmailContent function
  let content = extractEmailContent(htmlContent);

  // Canonicalize every line-break tag so later patterns see one spelling
  content = content.replace(/<br\s*\/?>/gi, '<br/>');

  // Put each sibling div on its own source line
  content = content.replace(/<\/div><div>/g, '</div>\n<div>');

  // Tighten "sign-off, <br/>" to "sign-off,<br/>". The captured text is
  // re-emitted via $1 so the author's capitalization is preserved, and the
  // g flag handles every sign-off in the message, not just the first.
  content = content.replace(
    /(Best regards,|Thanks,|Best,|Regards,)\s*<br\s*\/?>/gi,
    '$1<br/>'
  );

  // Tighten "sign-off,<br/> Name" to "sign-off,<br/>Name"
  content = content.replace(
    /(Thanks,|Best,|Regards,|Best regards,)<br\s*\/?>\s*([A-Za-z]+)/gi,
    '$1<br/>$2'
  );

  return content;
}
