# ebooks/generators/markov.py
#
# (File-viewer metadata at time of capture: 80 lines, 2.2 KiB, Python)

# SPDX-License-Identifier: MPL-2.0
import sqlite3
import markovify
from random import randint, choice
def _load_posts(cfg):
    """Return up to 10000 randomly chosen rows of post content from the database.

    Args:
        cfg: Configuration mapping. Reads ``db_path`` and ``learn_from_cw``;
            when ``learn_from_cw`` is truthy it also reads ``ignored_cws``.

    Returns:
        A list of 1-tuples ``(content,)`` as returned by ``fetchall()``.
    """
    db = sqlite3.connect(cfg["db_path"])
    try:
        db.text_factory = str
        c = db.cursor()
        if cfg['learn_from_cw']:
            # One "?" placeholder per ignored content warning, so the values
            # are bound as parameters rather than interpolated into the SQL.
            ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"
            return c.execute(
                f"""
                SELECT content
                FROM posts
                WHERE
                    summary IS NULL
                    OR summary NOT IN {ignored_cws_query_params}
                ORDER BY RANDOM() LIMIT 10000
                """,
                cfg["ignored_cws"],
            ).fetchall()
        # Not learning from CW'd posts: only rows whose summary is NULL.
        return c.execute(
            """
            SELECT content
            FROM posts
            WHERE summary IS NULL
            ORDER BY RANDOM()
            LIMIT 10000
            """,
        ).fetchall()
    finally:
        # Always release the connection, including when a query fails.
        db.close()


def make_sentence(cfg, keywords):
    """Generate one sentence from a Markov model built over stored posts.

    Args:
        cfg: Configuration mapping. Keys read here: ``db_path``,
            ``learn_from_cw``, ``ignored_cws`` (only when ``learn_from_cw``
            is truthy), ``overlap_ratio_enabled``, ``overlap_ratio``,
            ``limit_length``, ``length_lower_limit``, ``length_upper_limit``
            and ``keywords_from_reply``.
        keywords: Text whose words may be used to seed the sentence. Only
            used when ``cfg['keywords_from_reply']`` is truthy; may be falsy.

    Returns:
        The generated sentence (at most 500 characters long).

    Raises:
        ValueError: If the query returns no posts, or if no acceptable
            sentence was produced after 10 rounds of attempts.
    """
    # Load the corpus first; the connection is already closed by the time we
    # get here, so the error paths below cannot leak it.
    toots = _load_posts(cfg)
    if not toots:
        raise ValueError("Database is empty! Try running main.py.")

    class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
        def test_sentence_input(self, sentence):
            return True  # all sentences are valid <3

    nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed
    # TODO support replicating \n in output posts instead of squashing them together
    model = nlt("\n".join(toot[0].replace('\n', ' ') for toot in toots))

    # The word limit is drawn once and reused for every attempt below.
    if cfg['limit_length']:
        max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])
    else:
        max_words = None

    def make_short_sentence_with_keyword(max_chars, min_chars=0, keywords=None, **kwargs):
        """Try to build a sentence of min_chars..max_chars characters.

        When ``keywords`` is truthy, each attempt picks a random word from it
        as the starting point and falls back to an unseeded sentence if that
        fails. Remaining keyword arguments are forwarded to the model.
        Returns ``None`` when every attempt fails.
        """
        # Fall back to 10 attempts when the caller gives no "tries" instead of
        # crashing on range(None).
        # NOTE(review): 10 is believed to match markovify's own default - confirm.
        tries = kwargs.get("tries", 10)
        for _ in range(tries):
            if keywords:
                try:
                    keyword = choice(model.word_split(keywords))
                    sentence = model.make_sentence_with_start(keyword, strict=False, **kwargs)
                except Exception:
                    # Best-effort seeding: any ordinary failure (e.g. no usable
                    # keyword) falls back to an unseeded sentence. Unlike a
                    # bare "except:", this lets KeyboardInterrupt/SystemExit
                    # through so the long retry loop can be interrupted.
                    sentence = model.make_sentence(**kwargs)
            else:
                sentence = model.make_sentence(**kwargs)
            if sentence and min_chars <= len(sentence) <= max_chars:
                return sentence
        return None

    seed_keywords = keywords if cfg['keywords_from_reply'] else None
    max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7
    for _ in range(10):
        sentence = make_short_sentence_with_keyword(
            keywords=seed_keywords,
            max_chars=500,
            tries=10000,
            max_overlap_ratio=max_overlap_ratio,
            max_words=max_words,
        )
        if sentence is not None:
            return sentence
    raise ValueError("Failed 10 times to produce a sentence!")