# mail-comment/email-processor.py

import imaplib
import email
import os
import requests
import re
from email.header import decode_header
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from datetime import datetime

# Load settings via python-dotenv before any os.getenv() call below reads them
# (presumably from a local .env file — confirm against the deployment setup).
load_dotenv()

def clean_html(raw_html):
    """Reduce untrusted HTML to a small whitelist of formatting tags.

    Tags in the whitelist are kept with all of their attributes removed.
    ``<script>`` and ``<style>`` elements are deleted together with their
    content. Every other tag is removed but its inner text and children
    are kept.

    Args:
        raw_html: HTML (or plain text) to sanitise. Falsy values such as
            ``None`` or ``""`` yield an empty string.

    Returns:
        The sanitised markup serialised as a string.
    """
    if not raw_html:
        return ""

    # Only basic formatting tags survive.
    allowed_tags = {'b', 'i', 'u', 'em', 'strong', 'blockquote', 'p', 'br'}
    # Elements whose *content* is code, not text: unwrapping them would dump
    # raw JavaScript/CSS into the comment, so they are removed entirely.
    dropped_tags = ['script', 'style']

    soup = BeautifulSoup(raw_html, "html.parser")

    # Separate pass so the whitelist loop never touches destroyed elements.
    for tag in soup.find_all(dropped_tags):
        tag.decompose()

    for tag in soup.find_all(True):
        if tag.name in allowed_tags:
            tag.attrs = {}  # Strip attributes such as class, style, id, on*
        else:
            tag.unwrap()  # Drop the tag itself but keep its inner content

    return str(soup)

def _decode_mime_words(value):
    """Decode every RFC 2047 fragment of *value* and join them into one str."""
    pieces = []
    for fragment, charset in decode_header(value):
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # Unknown charset label in the header: fall back to UTF-8.
                fragment = fragment.decode("utf-8", errors="replace")
        pieces.append(fragment)
    return "".join(pieces)


def get_sender_name(from_header):
    """Return a display name for the sender described by a ``From`` header.

    The display name is preferred (``Name`` from ``Name <user@host>``),
    with MIME encoded-words decoded and surrounding quotes removed. When
    the header carries no display name, the part of the address before the
    ``@`` is returned instead.

    Args:
        from_header: Raw ``From`` header value; may be ``None`` or empty.

    Returns:
        The sender name, or ``""`` when the header is missing or empty.
    """
    # Local import: the module only imports ``email`` and ``email.header``.
    from email.utils import parseaddr

    if not from_header:
        return ""

    header_text = str(from_header)

    # Split BEFORE decoding: a decoded name may contain commas or angle
    # brackets that would confuse the address parser.
    real_name, address = parseaddr(header_text)

    name = _decode_mime_words(real_name).strip() if real_name else ""
    if name:
        return name
    if "@" in address:
        return address.split("@")[0].strip()

    # Last resort for headers the address parser could not interpret:
    # decode the whole value and apply the simple textual heuristics.
    name = _decode_mime_words(header_text)
    if "<" in name:
        name = name.split("<")[0]
    elif "@" in name:
        name = name.split("@")[0]
    return name.strip()

def _decode_mime_header(raw_value):
    """Decode an RFC 2047 header value into one string ("" when missing)."""
    if raw_value is None:
        return ""
    fragments = []
    for chunk, charset in decode_header(str(raw_value)):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # Unknown charset label: fall back to UTF-8.
                chunk = chunk.decode("utf-8", errors="replace")
        fragments.append(chunk)
    return "".join(fragments)


def _decode_part(part):
    """Return the text payload of a message part using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    try:
        return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
    except LookupError:
        return payload.decode("utf-8", errors="replace")


def _extract_body(msg):
    """Pick the body text: first text/html part, else first text/plain part."""
    if not msg.is_multipart():
        return _decode_part(msg)

    plain_fallback = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/html":
            return _decode_part(part)
        if content_type == "text/plain" and plain_fallback is None:
            plain_fallback = part
    return _decode_part(plain_fallback) if plain_fallback is not None else ""


def process_emails():
    """Turn the e-mails of an IMAP folder into comments and delete them.

    For every message whose subject contains ``DOMAIN_TO_SEARCH`` and whose
    body contains none of the ``IGNORE_KEYWORDS``, the sanitised body is
    optionally written to ``msg_<id>.html`` (``SAVE_HTML``), written to
    ``msg_<id>.xml`` as a one-item RSS document (``SAVE_XML``) and posted
    as JSON to ``WP_WEBHOOK_URL`` (``SEND_WEBHOOK``). The message is then
    flagged as deleted and the folder is expunged at the end.

    A message whose webhook delivery fails is reported and left in the
    mailbox so a later run can retry it. Messages filtered out by domain
    or keyword are left untouched.

    Connection settings are read from ``IMAP_SERVER``, ``IMAP_USER``,
    ``IMAP_PASS`` and ``IMAP_FOLDER`` (``INBOX`` when unset). The webhook
    payload carries ``WEBHOOK_SECRET_TOKEN`` as ``auth_token``.

    Raises:
        ValueError: if ``DOMAIN_TO_SEARCH`` is not configured.
    """
    # Local imports: neither name is imported at module level.
    from email.utils import formatdate
    from html import escape

    # Lower-cased because it is compared against the lower-cased subject.
    domain = (os.getenv("DOMAIN_TO_SEARCH") or "").strip().lower()
    if not domain:
        # Without a filter every message in the folder would be processed
        # and deleted, so refuse to run instead.
        raise ValueError("DOMAIN_TO_SEARCH is not configured")

    # Empty entries are dropped: '' is a substring of every body and would
    # cause every single message to be skipped.
    ignore_list = [
        word.strip().lower()
        for word in (os.getenv("IGNORE_KEYWORDS") or "").split(',')
        if word.strip()
    ]

    save_html = os.getenv("SAVE_HTML") == "True"
    save_xml = os.getenv("SAVE_XML") == "True"
    send_webhook = os.getenv("SEND_WEBHOOK") == "True"

    # The context manager logs out even when processing raises.
    with imaplib.IMAP4_SSL(os.getenv("IMAP_SERVER")) as mail:
        mail.login(os.getenv("IMAP_USER"), os.getenv("IMAP_PASS"))
        mail.select(os.getenv("IMAP_FOLDER") or "INBOX")

        _, messages = mail.search(None, 'ALL')
        print(messages)

        for m_id in messages[0].split():
            _, msg_data = mail.fetch(m_id, '(RFC822)')
            for response_part in msg_data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])

                # 1. Subject (doubles as the page title / URL)
                subject = _decode_mime_header(msg["Subject"])

                # 2. Keep only messages whose subject mentions the domain
                if domain not in subject.lower():
                    continue

                # 3. Sender display name (without the mail domain)
                sender_name = get_sender_name(msg.get("From") or "")

                # 4. Body, then keyword filter on its content
                body = _extract_body(msg)
                lowered_body = body.lower()
                if any(word in lowered_body for word in ignore_list):
                    continue

                clean_body = clean_html(body)
                rss_title = f"Comentario de {sender_name} sobre {subject}"
                file_stem = f"msg_{m_id.decode()}"

                # --- ACTIONS ---

                # HTML: sanitised body; the sender name is untrusted text
                # and must be escaped before being embedded in markup.
                if save_html:
                    with open(f"{file_stem}.html", "w", encoding="utf-8") as f:
                        f.write(f"<h3>Remitente: {escape(sender_name)}</h3><div>{clean_body}</div>")

                # XML: single-item RSS document
                if save_xml:
                    rss = ET.Element("rss", version="2.0")
                    channel = ET.SubElement(rss, "channel")
                    item = ET.SubElement(channel, "item")
                    ET.SubElement(item, "title").text = rss_title
                    ET.SubElement(item, "description").text = clean_body
                    ET.SubElement(item, "author").text = sender_name
                    # Real GMT timestamp, independent of the process locale.
                    ET.SubElement(item, "pubDate").text = formatdate(usegmt=True)
                    ET.ElementTree(rss).write(f"{file_stem}.xml", encoding="utf-8", xml_declaration=True)

                # Webhook: the subject is the URL of the page being commented
                if send_webhook:
                    target_url = subject if "http" in subject else f"http://{subject}"
                    try:
                        response = requests.post(
                            os.getenv("WP_WEBHOOK_URL"),
                            json={
                                "auth_token": os.getenv("WEBHOOK_SECRET_TOKEN"),
                                "target_page": target_url,
                                "author_name": sender_name,
                                "comment_content": clean_body
                            },
                            timeout=30,  # never hang the whole run on one request
                        )
                        response.raise_for_status()
                    except requests.RequestException as exc:
                        # Keep the message so the comment is not lost; a
                        # later run will retry the delivery.
                        print(f"Webhook delivery failed for message {m_id.decode()}: {exc}")
                        continue

                # Mark as deleted; removed for good by the expunge below
                mail.store(m_id, '+FLAGS', '\\Deleted')

        mail.expunge()

# Process the mailbox only when run as a script, not when imported as a module.
if __name__ == "__main__":
    process_emails()