From 8ee3e137344aca88d3acbf3a38117619865e2b00 Mon Sep 17 00:00:00 2001
From: Dario Ernst
Date: Thu, 26 Dec 2024 19:23:03 +0100
Subject: [PATCH] Initial working commit

---
 config.json.tmpl |  12 +++++
 parser.py        | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   2 +
 3 files changed, 147 insertions(+)
 create mode 100644 config.json.tmpl
 create mode 100755 parser.py
 create mode 100644 requirements.txt

diff --git a/config.json.tmpl b/config.json.tmpl
new file mode 100644
index 0000000..9dd1833
--- /dev/null
+++ b/config.json.tmpl
@@ -0,0 +1,12 @@
+{
+    "keywords": [
+        ".*mara.*",
+        ".*xenia.*",
+        ".*bz.*10.*"
+    ],
+    "to_mail": "dario@kanojo.de",
+    "from_mail": "",
+    "smtp_server": "mail.ghostdub.de",
+    "smtp_user": "",
+    "smtp_pass": ""
+}
\ No newline at end of file
diff --git a/parser.py b/parser.py
new file mode 100755
index 0000000..2a47485
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+"""Watch a kaffee-netz.de marketplace forum and mail new matching offers.
+
+Reads ``config.json`` (see ``config.json.tmpl``), checks the thread titles on
+the forum listing page against the configured keyword regexes and sends one
+mail per matching thread that has not been reported before.  Reported threads
+are remembered in ``matches.json``.
+"""
+
+import re
+import json
+import os
+from smtplib import SMTP_SSL
+from email.mime.text import MIMEText
+from requests_html import HTMLSession
+
+
+BASE_URL = "https://www.kaffee-netz.de"
+FORUM_URL = BASE_URL + "/forums/private-angebote-maschinen-und-muehlen.14/"
+CONFIG_FILE = "config.json"
+MATCHES_FILE = "matches.json"
+
+
+def load_matches():
+    """Return the already reported threads as a ``{link: title}`` dict."""
+    if not os.path.isfile(MATCHES_FILE):
+        return {}
+    with open(MATCHES_FILE, "r", encoding="utf-8") as fil:
+        return json.load(fil)
+
+
+def save_matches(matches):
+    """Write the reported threads to ``matches.json``."""
+    with open(MATCHES_FILE, "w", encoding="utf-8") as fil:
+        json.dump(matches, fil)
+
+
+def find_thread_link(item):
+    """Return the href of the thread link inside *item*, or None if absent."""
+    link = None
+    for anchor in item.find("a"):
+        href = anchor.attrs.get("href", "")
+        if "threads/" in href:
+            # No break: if several anchors qualify, the last one is used.
+            link = href
+    return link
+
+
+def matches_keyword(title, keywords):
+    """Return True if any regex in *keywords* matches *title*.
+
+    The patterns are applied with ``re.match``, i.e. anchored at the start of
+    the title, and case-insensitively.
+    """
+    return any(re.match(kw, title, re.IGNORECASE) for kw in keywords)
+
+
+def send_mail(config, title, url, description):
+    """Mail one offer to ``config["to_mail"]`` via SMTP over SSL."""
+    mail_body = """
+Neues Angebot im Kaffeenetz gefunden!
+
+%s: %s
+
+Beschreibung:
+%s
+""" % (title, url, description)
+
+    # MIMEText takes the text as str and encodes it with the given charset.
+    mime_body = MIMEText(mail_body, _charset="utf-8")
+    mime_body["Subject"] = "Neues Kaffeenetz Angebot: " + title
+    mime_body["From"] = config["from_mail"]
+    mime_body["To"] = config["to_mail"]
+
+    # The host comes from the configuration so that ``smtp_server`` in
+    # config.json is honoured.
+    with SMTP_SSL(config["smtp_server"]) as smtp:
+        smtp.login(config["smtp_user"], config["smtp_pass"])
+        smtp.sendmail(config["from_mail"], config["to_mail"], mime_body.as_string())
+
+
+def main():
+    """Scan the forum listing once and mail every new matching offer."""
+    with open(CONFIG_FILE, "r", encoding="utf-8") as fil:
+        config = json.load(fil)
+
+    session = HTMLSession()
+    r = session.get(FORUM_URL)
+    items = r.html.find("div.structItem-title")
+    matches = load_matches()
+
+    try:
+        for item in items:
+            title = item.text
+            print("\nprocessing item", item, "with title", title)
+
+            link = find_thread_link(item)
+            if not link:
+                # Without a thread link there is nothing to fetch or remember.
+                print("could not find link for item", title)
+                continue
+
+            print("found link", link)
+
+            if not matches_keyword(title, config["keywords"]):
+                print("... no match")
+                continue
+
+            print("... match!")
+            if link in matches:
+                print("... is already known")
+                continue
+
+            url = BASE_URL + link
+            content = session.get(url)
+            # find(..., first=True) yields None when the selector matches nothing.
+            wrapper = content.html.find("div.bbWrapper", first=True)
+            description = wrapper.text if wrapper is not None else ""
+
+            print("Sending mail ...")
+            send_mail(config, title, url, description)
+
+            # Remember the thread only after the mail went out, so that a
+            # failed send is retried on the next run.
+            matches[link] = title
+    finally:
+        # Also save when an item fails, so offers already mailed in this run
+        # are not mailed again.
+        save_matches(matches)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c72fb43
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+lxml_html_clean==0.4.1
+requests-html==0.10.0
\ No newline at end of file