"""Turn e-mails from an IMAP folder into page comments.

Each message whose subject contains the configured domain is treated as a
comment on the page named in the subject.  Depending on environment flags the
comment is saved as Markdown, HTML or RSS/XML, posted to a webhook, and/or
appended to the matching Hugo content file.  Processed messages are deleted
from the mailbox.
"""

import email
import html
import imaplib
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.errors import HeaderParseError
from email.header import decode_header, make_header
from email.utils import format_datetime, parseaddr
from pathlib import Path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from markdownify import markdownify as md

load_dotenv()

# Only basic inline/block formatting tags survive sanitising.
ALLOWED_TAGS = frozenset(
    ['b', 'i', 'u', 'em', 'strong', 'blockquote', 'p', 'br']
)

# Seconds to wait for the webhook endpoint before giving up.
WEBHOOK_TIMEOUT = 10


def clean_html(raw_html):
    """Return *raw_html* reduced to a small whitelist of formatting tags.

    Tags outside ``ALLOWED_TAGS`` are removed while their inner text is kept;
    allowed tags are kept but stripped of every attribute (class, style, id,
    event handlers, ...).  ``<script>`` and ``<style>`` elements are dropped
    together with their content so code/CSS never leaks into the comment text.
    """
    soup = BeautifulSoup(raw_html, "html.parser")

    # Remove these first: unwrapping them would keep their source as text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    for tag in soup.find_all(True):
        if tag.name not in ALLOWED_TAGS:
            tag.unwrap()  # drop the tag, keep the inner text
        else:
            tag.attrs = {}  # drop attributes such as class, style, id
    return str(soup)


def _decode_header_value(value):
    """Decode a possibly RFC 2047-encoded header into one ``str``.

    All encoded chunks are joined (not just the first one).  A missing header
    yields ``""``; an undecodable one falls back to its raw text.
    """
    if value is None:
        return ""
    try:
        return str(make_header(decode_header(value)))
    except (LookupError, UnicodeDecodeError, HeaderParseError):
        return str(value)


def get_sender_name(from_header):
    """Extract a display name from a ``From`` header.

    Returns the decoded real name when present, otherwise the part of the
    address before the ``@``.  Returns ``""`` when the header is missing.
    """
    if not from_header:
        return ""

    # Split first, decode second: a decoded name may itself contain "<" or ",".
    real_name, address = parseaddr(str(from_header))
    name = _decode_header_value(real_name).strip()
    if name:
        return name
    return address.split("@")[0].strip()


def _decode_part(part):
    """Decode a message part's payload to text using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except LookupError:
        # Unknown charset label: best effort as UTF-8.
        return payload.decode("utf-8", errors="replace")


def _extract_body(msg):
    """Return the HTML body of *msg*, or its escaped plain text as fallback."""
    if not msg.is_multipart():
        return _decode_part(msg)

    plain_fallback = ""
    for part in msg.walk():
        if part.get_content_disposition() == "attachment":
            continue
        content_type = part.get_content_type()
        if content_type == "text/html":
            return _decode_part(part)
        if content_type == "text/plain" and not plain_fallback:
            # Escape so literal "<...>" in plain text is not parsed as a tag.
            plain_fallback = html.escape(_decode_part(part), quote=False)
    return plain_fallback


def _env_flag(name):
    """Return True when environment variable *name* is exactly ``"True"``."""
    return os.getenv(name) == "True"


def _save_markdown(msg_id, sender_name, clean_body):
    """Write the comment as Markdown to ``msg_<id>.md``."""
    with open(f"msg_{msg_id}.md", "w", encoding="utf-8") as f:
        f.write(f"# Remitente: {sender_name}\n{md(clean_body)}")


def _save_html(msg_id, sender_name, clean_body):
    """Write the sanitised comment to ``msg_<id>.html``."""
    # NOTE(review): the original literal appears to have lost its wrapper
    # markup; only the visible text is reproduced here - confirm the intended
    # template.  The sender name is escaped because it is untrusted input.
    with open(f"msg_{msg_id}.html", "w", encoding="utf-8") as f:
        f.write(
            f"\n\nRemitente: {html.escape(sender_name)}\n\n{clean_body}\n"
        )


def _save_rss(msg_id, rss_title, sender_name, clean_body):
    """Write the comment as a single-item RSS 2.0 document, ``msg_<id>.xml``."""
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")
    item = ET.SubElement(channel, "item")
    ET.SubElement(item, "title").text = rss_title
    ET.SubElement(item, "description").text = clean_body
    ET.SubElement(item, "author").text = sender_name
    # Real UTC time, so the "GMT" suffix is truthful and locale-independent.
    ET.SubElement(item, "pubDate").text = format_datetime(
        datetime.now(timezone.utc), usegmt=True
    )
    ET.ElementTree(rss).write(
        f"msg_{msg_id}.xml", encoding="utf-8", xml_declaration=True
    )


def _send_webhook(subject, sender_name, clean_body):
    """POST the comment to the configured webhook; failures are reported."""
    # The subject is the URL of the page being commented on.
    target_url = subject if "http" in subject else f"http://{subject}"
    try:
        response = requests.post(
            os.getenv("WP_WEBHOOK_URL"),
            json={
                "auth_token": os.getenv("WEBHOOK_SECRET_TOKEN"),
                "target_page": target_url,
                "author_name": sender_name,
                "comment_content": clean_body,
            },
            timeout=WEBHOOK_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: one failing webhook must not abort the whole run.
        print(f"⚠️ No se pudo enviar el webhook: {exc}")


def _append_to_hugo(subject, sender_name, clean_body):
    """Append the comment as a blockquote to the matching Hugo content file."""
    hugo_dir = os.getenv("HUGO_DIR")
    if not hugo_dir:
        print("❌ HUGO_DIR no está configurado.")
        return

    markdown_text = md(clean_body).strip()
    comment_md = "\n".join(f">{line}" for line in markdown_text.split("\n"))

    parsed_url = urlparse(subject)
    ruta_relativa = parsed_url.path + (
        f"?{parsed_url.query}" if parsed_url.query else ""
    )
    p = Path(ruta_relativa.lstrip("/"))

    content_root = Path(hugo_dir) / "content"
    archivo_md = content_root / p.parent / (p.name + ".md")

    # The path comes from an e-mail subject: refuse anything that would
    # resolve outside the content directory (e.g. "../../").
    try:
        archivo_md.resolve().relative_to(content_root.resolve())
    except ValueError:
        print(f"❌ Ruta fuera del directorio de contenido: {archivo_md}")
        return

    if archivo_md.is_file():
        with open(archivo_md, "a", encoding="utf-8") as f:
            f.write(f"\n\n## {sender_name} dice:\n{comment_md}.")
        print(f"✅ Archivo actualizado en: {archivo_md}")
    else:
        print(
            f"❌ No se encontró el archivo en {archivo_md}. "
            "Verifica la estructura."
        )


def _handle_message(msg, msg_id, domain, ignore_list):
    """Run the enabled actions for one message.

    Returns True when the message was processed, False when it was filtered
    out (subject lacks the domain, or the body contains an ignored keyword).
    """
    # 1. Subject (the title/URL of the page).
    subject = _decode_header_value(msg["Subject"]).strip()

    # 2. Filter by domain in the subject (URL).
    if domain not in subject.lower():
        return False

    # 3. Sender (display name without the domain).
    sender_name = get_sender_name(msg.get("From"))

    # 4. Body; skip messages containing any ignored keyword.
    body = _extract_body(msg)
    lowered_body = body.lower()
    if any(word in lowered_body for word in ignore_list):
        return False

    clean_body = clean_html(body)
    rss_title = f"Comentario de {sender_name} sobre {subject}"

    # --- ACTIONS ---
    if _env_flag("SAVE_MARKDOWN"):
        _save_markdown(msg_id, sender_name, clean_body)
    if _env_flag("SAVE_HTML"):
        _save_html(msg_id, sender_name, clean_body)
    if _env_flag("SAVE_XML"):
        _save_rss(msg_id, rss_title, sender_name, clean_body)
    if _env_flag("SEND_WEBHOOK"):
        _send_webhook(subject, sender_name, clean_body)
    if _env_flag("APPEND_HUGO"):
        _append_to_hugo(subject, sender_name, clean_body)
    return True


def process_emails():
    """Fetch every message in the configured folder and process it.

    Configuration is read from the environment: ``IMAP_SERVER``,
    ``IMAP_USER``, ``IMAP_PASS``, ``IMAP_FOLDER``, ``DOMAIN_TO_SEARCH``,
    ``IGNORE_KEYWORDS`` (comma separated, optional) and the action flags
    ``SAVE_MARKDOWN``, ``SAVE_HTML``, ``SAVE_XML``, ``SEND_WEBHOOK`` and
    ``APPEND_HUGO`` (enabled only when set to exactly ``"True"``).

    Raises:
        RuntimeError: if ``DOMAIN_TO_SEARCH`` is not set.
    """
    domain = os.getenv("DOMAIN_TO_SEARCH")
    if domain is None:
        raise RuntimeError("DOMAIN_TO_SEARCH is not set")
    domain = domain.strip().lower()

    # Drop empty entries: "" is a substring of every body and would make the
    # filter reject all messages.
    ignore_list = [
        word.strip().lower()
        for word in (os.getenv("IGNORE_KEYWORDS") or "").split(',')
        if word.strip()
    ]

    # The context manager logs out even if processing raises.
    with imaplib.IMAP4_SSL(os.getenv("IMAP_SERVER")) as mail:
        mail.login(os.getenv("IMAP_USER"), os.getenv("IMAP_PASS"))
        mail.select(os.getenv("IMAP_FOLDER"))

        status, messages = mail.search(None, 'ALL')
        if status != "OK" or not messages or messages[0] is None:
            return

        for m_id in messages[0].split():
            res, msg_data = mail.fetch(m_id, '(RFC822)')
            raw_message = next(
                (part[1] for part in msg_data if isinstance(part, tuple)),
                None,
            )
            if raw_message is None:
                continue

            msg = email.message_from_bytes(raw_message)
            # NOTE(review): only processed messages are flagged for deletion;
            # filtered-out mail stays in the mailbox.  Confirm this matches
            # the intended behaviour.
            if _handle_message(msg, m_id.decode(), domain, ignore_list):
                mail.store(m_id, '+FLAGS', '\\Deleted')

        # Expunge once at the end so sequence numbers stay valid in the loop.
        mail.expunge()


if __name__ == "__main__":
    process_emails()