From 8ee3e137344aca88d3acbf3a38117619865e2b00 Mon Sep 17 00:00:00 2001
From: Dario Ernst <git@kanojo.de>
Date: Thu, 26 Dec 2024 19:23:03 +0100
Subject: [PATCH] Initial working commit

---
 config.json.tmpl | 12 +++++++
 parser.py        | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 3 files changed, 95 insertions(+)
 create mode 100644 config.json.tmpl
 create mode 100755 parser.py
 create mode 100644 requirements.txt

diff --git a/config.json.tmpl b/config.json.tmpl
new file mode 100644
index 0000000..9dd1833
--- /dev/null
+++ b/config.json.tmpl
@@ -0,0 +1,12 @@
+{
+    "keywords": [
+    ".*mara.*",
+    ".*xenia.*",
+    ".*bz.*10.*"
+    ],
+    "to_mail": "dario@kanojo.de",
+    "from_mail": "",
+    "smtp_server": "mail.ghostdub.de",
+    "smtp_user": "",
+    "smtp_pass": ""
+}
\ No newline at end of file
diff --git a/parser.py b/parser.py
new file mode 100755
index 0000000..2a47485
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+"""Mail new kaffee-netz.de private-offer threads whose title matches a keyword.
+
+Settings come from config.json; already-mailed threads live in matches.json.
+"""
+
+import re
+import json
+import os
+from smtplib import SMTP_SSL
+from email.mime.text import MIMEText
+from requests_html import HTMLSession
+
+BASE_URL = "https://www.kaffee-netz.de"
+
+with open("config.json", "r") as fil:
+    config = json.load(fil)
+
+session = HTMLSession()
+r = session.get(BASE_URL + "/forums/private-angebote-maschinen-und-muehlen.14/")
+items = r.html.find("div.structItem-title")
+
+# matches.json maps the link of every thread already mailed to its title.
+matches = {}
+if os.path.isfile("matches.json"):
+    with open("matches.json", "r") as fil:
+        matches = json.load(fil)
+
+try:
+    for item in items:
+        title = item.text
+        print("\nprocessing item", item, "with title", title)
+
+        # A title cell can hold several anchors; keep the one to the thread.
+        link = None
+        for anchor in item.find("a"):
+            if "threads/" in anchor.attrs.get("href", ""):
+                link = anchor.attrs["href"]
+        if not link:
+            print("could not find link for item", title)
+            continue
+        print("found link", link)
+
+        if not any(re.match(kw, title, re.IGNORECASE) for kw in config["keywords"]):
+            print("... no match")
+            continue
+        print("... match!")
+        if link in matches:
+            print("... is already known")
+            continue
+
+        content = session.get(BASE_URL + link)
+        post = content.html.find("div.bbWrapper", first=True)
+        description = post.text if post is not None else ""
+
+        mail_body = """
+Neues Angebot im Kaffeenetz gefunden!
+
+%s: %s
+
+Beschreibung:
+%s
+""" % (title, BASE_URL + link, description)
+
+        mime_body = MIMEText(mail_body, _charset="utf-8")
+        mime_body["Subject"] = "Neues Kaffeenetz Angebot: " + title
+        mime_body["From"] = config["from_mail"]
+        mime_body["To"] = config["to_mail"]
+
+        print("Sending mail ...")
+        # Older config.json files have no "smtp_server"; keep the old host then.
+        with SMTP_SSL(config.get("smtp_server", "mail.ghostdub.de")) as smtp:
+            smtp.login(config["smtp_user"], config["smtp_pass"])
+            smtp.sendmail(config["from_mail"], config["to_mail"], mime_body.as_string())
+
+        # Record only after a successful send so a failed mail is retried.
+        matches[link] = title
+finally:
+    # Persist even if a fetch or send raised, so sent mails are not repeated.
+    with open("matches.json", "w") as fil:
+        json.dump(matches, fil)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c72fb43
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+lxml_html_clean==0.4.1
+requests-html==0.10.0
\ No newline at end of file