124 lines
4.8 KiB
Python
124 lines
4.8 KiB
Python
import email
import html
import imaplib
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from email.header import decode_header
from email.utils import formatdate, parseaddr

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
|
|
|
|
# Load configuration before any os.getenv() lookup below runs
# (presumably from a local .env file -- confirm against the deployment setup).
load_dotenv()
|
|
|
|
def clean_html(raw_html):
    """Reduce an HTML fragment to a small whitelist of formatting tags.

    Tags in the whitelist are kept but stripped of every attribute
    (class, style, id, event handlers, ...). Any other tag is removed
    while its inner content is preserved. ``<script>`` and ``<style>``
    elements are the exception: they are removed together with their
    content, because that content is code rather than text and would
    otherwise end up as visible garbage in the output.

    Args:
        raw_html: HTML markup as a string. ``None`` is treated as empty.

    Returns:
        The sanitized markup as a string.
    """
    # Only basic formatting tags are allowed through.
    allowed_tags = {'b', 'i', 'u', 'em', 'strong', 'blockquote', 'p', 'br'}
    soup = BeautifulSoup(raw_html or "", "html.parser")

    # Drop these first. Their content is raw text, so they never contain
    # other tags and decomposing them cannot invalidate the list below.
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    for tag in soup.find_all(True):
        if tag.name in allowed_tags:
            tag.attrs = {}  # Strip attributes such as class, style, id
        else:
            tag.unwrap()  # Remove the tag but keep its inner content

    return str(soup)
|
|
|
|
def get_sender_name(from_header):
    """Extract a human-readable sender name from a ``From`` header.

    Resolution order:
      1. The display name (``Name`` in ``Name <user@host>``), with
         surrounding quotes removed and RFC 2047 encoded words decoded.
      2. The local part of the address (text before ``@``).
      3. The decoded header itself, when it holds no recognisable address.

    Args:
        from_header: Raw header value (``str`` or a header object), or
            ``None`` when the message has no ``From`` header.

    Returns:
        The sender name as a stripped string; ``""`` for a missing header.
    """
    if not from_header:
        return ""

    def decode_words(value):
        # Join every chunk: a header may mix several encoded words.
        pieces = []
        for chunk, charset in decode_header(value):
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(charset or "utf-8", errors="replace")
                except LookupError:
                    # Unknown or bogus charset label in the header.
                    chunk = chunk.decode("utf-8", errors="replace")
            pieces.append(chunk)
        return "".join(pieces)

    raw = str(from_header)
    # Split BEFORE decoding: a decoded display name may contain commas or
    # angle brackets that would confuse the address parser.
    display_name, address = parseaddr(raw)

    if display_name:
        return decode_words(display_name).strip()
    if "@" in address:
        return address.split("@")[0].strip()
    # No usable address (e.g. a bare name): fall back to the whole header.
    return decode_words(raw).strip()
|
|
|
|
def _decode_mime_header(raw_value):
    """Decode an RFC 2047 header value to text, joining every chunk."""
    if not raw_value:
        return ""
    pieces = []
    for chunk, charset in decode_header(str(raw_value)):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # Unknown or bogus charset label in the header.
                chunk = chunk.decode("utf-8", errors="replace")
        pieces.append(chunk)
    return "".join(pieces)


def _decode_part(part):
    """Return the text payload of one MIME part using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    try:
        return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
    except LookupError:
        return payload.decode("utf-8", errors="replace")


def _extract_body(msg):
    """Return the first text/html part of a multipart message, else the sole payload."""
    if not msg.is_multipart():
        return _decode_part(msg)
    for part in msg.walk():
        if part.get_content_type() == "text/html":
            return _decode_part(part)
    return ""


def _write_html(path, sender_name, clean_body):
    """Write the sanitized comment to *path* as a small HTML fragment."""
    with open(path, "w", encoding="utf-8") as f:
        # The sender name is untrusted header text, so escape it.
        f.write(f"<h3>Remitente: {html.escape(sender_name)}</h3><div>{clean_body}</div>")


def _write_rss(path, rss_title, sender_name, clean_body):
    """Write the comment to *path* as a single-item RSS 2.0 document."""
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")
    item = ET.SubElement(channel, "item")
    ET.SubElement(item, "title").text = rss_title
    ET.SubElement(item, "description").text = clean_body
    ET.SubElement(item, "author").text = sender_name
    # formatdate() yields a real UTC timestamp, independent of the locale.
    ET.SubElement(item, "pubDate").text = formatdate(usegmt=True)
    ET.ElementTree(rss).write(path, encoding="utf-8", xml_declaration=True)


def _send_webhook(subject, sender_name, clean_body):
    """POST the comment to the configured webhook; return True on success."""
    # The subject is the URL of the page being commented on.
    target_url = subject if "http" in subject else f"http://{subject}"
    try:
        response = requests.post(
            os.getenv("WP_WEBHOOK_URL"),
            json={
                "auth_token": os.getenv("WEBHOOK_SECRET_TOKEN"),
                "target_page": target_url,
                "author_name": sender_name,
                "comment_content": clean_body,
            },
            timeout=30,  # never hang the whole run on one unresponsive server
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Webhook delivery failed for {target_url}: {exc}")
        return False
    return True


def process_emails():
    """Turn matching e-mails in an IMAP folder into comments, then delete them.

    Configuration is read from environment variables:
      IMAP_SERVER, IMAP_USER, IMAP_PASS, IMAP_FOLDER (default ``INBOX``),
      DOMAIN_TO_SEARCH (required), IGNORE_KEYWORDS (comma separated,
      optional), SAVE_HTML / SAVE_XML / SEND_WEBHOOK (enabled when exactly
      ``"True"``), WP_WEBHOOK_URL and WEBHOOK_SECRET_TOKEN.

    For every message whose subject contains the configured domain and
    whose body contains none of the ignore keywords, the sanitized body is
    optionally saved as ``msg_<id>.html`` / ``msg_<id>.xml`` and/or posted
    to the webhook. The message is then flagged as deleted. A message whose
    webhook delivery failed is left in the mailbox so a later run can retry.

    Raises:
        ValueError: if DOMAIN_TO_SEARCH is unset or empty. Without it every
            message in the folder would be processed and deleted.
    """
    domain = (os.getenv("DOMAIN_TO_SEARCH") or "").strip().lower()
    if not domain:
        raise ValueError("DOMAIN_TO_SEARCH must be set")

    # Discard empty entries: an empty keyword is a substring of every body
    # and would cause every message to be skipped.
    ignore_list = [
        word.strip().lower()
        for word in (os.getenv("IGNORE_KEYWORDS") or "").split(',')
        if word.strip()
    ]
    save_html = os.getenv("SAVE_HTML") == "True"
    save_xml = os.getenv("SAVE_XML") == "True"
    send_webhook = os.getenv("SEND_WEBHOOK") == "True"

    mail = imaplib.IMAP4_SSL(os.getenv("IMAP_SERVER"))
    try:
        mail.login(os.getenv("IMAP_USER"), os.getenv("IMAP_PASS"))
        mail.select(os.getenv("IMAP_FOLDER") or "INBOX")

        _, messages = mail.search(None, 'ALL')
        print(messages)

        for m_id in messages[0].split():
            _, msg_data = mail.fetch(m_id, '(RFC822)')
            for response_part in msg_data:
                if not isinstance(response_part, tuple):
                    continue
                msg = email.message_from_bytes(response_part[1])

                # 1. Subject (doubles as the page title / URL)
                subject = _decode_mime_header(msg["Subject"])

                # 2. Only handle messages whose subject mentions the domain
                if domain not in subject.lower():
                    continue

                # 3. Sender display name (without the mail domain)
                sender_raw = msg.get("From")
                sender_name = get_sender_name(sender_raw) if sender_raw else ""

                # 4. Body, filtered by the ignore list, then sanitized
                body = _extract_body(msg)
                if any(word in body.lower() for word in ignore_list):
                    continue

                clean_body = clean_html(body)
                rss_title = f"Comentario de {sender_name} sobre {subject}"

                # --- ACTIONS ---
                if save_html:
                    _write_html(f"msg_{m_id.decode()}.html", sender_name, clean_body)

                if save_xml:
                    _write_rss(f"msg_{m_id.decode()}.xml", rss_title, sender_name, clean_body)

                delivered = True
                if send_webhook:
                    delivered = _send_webhook(subject, sender_name, clean_body)

                # Delete only once the comment is safely handed over;
                # otherwise keep the message so the next run retries it.
                if delivered:
                    mail.store(m_id, '+FLAGS', '\\Deleted')

        mail.expunge()
    finally:
        # Always close the session, even when processing fails midway.
        mail.logout()
|
|
|
|
if __name__ == "__main__":
    # Run one mailbox pass only when executed as a script, not on import.
    process_emails()
|