# Gitea Actions workflow: on a daily schedule (or on push to master) generate
# blog drafts from Trilium notes with CrewAI agents and push them for review.
name: Create Blog Article if new notes exist
on:
  schedule:
    # Daily at 18:15 UTC.
    - cron: "15 18 * * *"
  push:
    branches:
      - master
jobs:
  prepare_blog_drafts_and_push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install dependencies
        shell: bash
        run: |
          apt update && apt upgrade -y
          apt install rustc cargo python-is-python3 pip python3-venv python3-virtualenv libmagic-dev git -y
          virtualenv .venv
          source .venv/bin/activate
          pip install --upgrade pip
          pip install -r requirements.txt
          # Git identity used for the automated blog commits/pushes.
          git config --global user.name "Blog Creator"
          git config --global user.email "ridgway.infrastructure@gmail.com"
          git config --global push.autoSetupRemote true

      - name: Create .env
        shell: bash
        run: |
          # First write truncates any pre-existing .env; the rest append.
          # FIX: quote TRILIUM_HOST like every other value so hostnames with
          # shell-special characters survive dotenv parsing consistently.
          echo "TRILIUM_HOST='${{ vars.TRILIUM_HOST }}'" > .env
          echo "TRILIUM_PORT='${{ vars.TRILIUM_PORT }}'" >> .env
          echo "TRILIUM_PROTOCOL='${{ vars.TRILIUM_PROTOCOL }}'" >> .env
          echo "TRILIUM_PASS='${{ secrets.TRILIUM_PASS }}'" >> .env
          echo "TRILIUM_TOKEN='${{ secrets.TRILIUM_TOKEN }}'" >> .env
          echo "OLLAMA_PROTOCOL='${{ vars.OLLAMA_PROTOCOL }}'" >> .env
          echo "OLLAMA_HOST='${{ vars.OLLAMA_HOST }}'" >> .env
          echo "OLLAMA_PORT='${{ vars.OLLAMA_PORT }}'" >> .env
          echo "EMBEDDING_MODEL='${{ vars.EMBEDDING_MODEL }}'" >> .env
          echo "EDITOR_MODEL='${{ vars.EDITOR_MODEL }}'" >> .env
          # Join the four individual model variables into one JSON array,
          # which is the format _compute_authors / the writing crew expect.
          export PURE='["${{ vars.CONTENT_CREATOR_MODELS_1 }}", "${{ vars.CONTENT_CREATOR_MODELS_2 }}", "${{ vars.CONTENT_CREATOR_MODELS_3 }}", "${{ vars.CONTENT_CREATOR_MODELS_4 }}"]'
          echo "CONTENT_CREATOR_MODELS='$PURE'" >> .env
          echo "GIT_PROTOCOL='${{ vars.GIT_PROTOCOL }}'" >> .env
          echo "GIT_REMOTE='${{ vars.GIT_REMOTE }}'" >> .env
          echo "GIT_USER='${{ vars.GIT_USER }}'" >> .env
          echo "GIT_PASS='${{ secrets.GIT_PASS }}'" >> .env
          echo "N8N_SECRET='${{ secrets.N8N_SECRET }}'" >> .env
          echo "N8N_WEBHOOK_URL='${{ vars.N8N_WEBHOOK_URL }}'" >> .env
          echo "CHROMA_HOST='${{ vars.CHROMA_HOST }}'" >> .env
          echo "CHROMA_PORT='${{ vars.CHROMA_PORT }}'" >> .env
          # API key for Ollama's hosted web search (used by the researcher agent).
          echo "OLLAMA_API_KEY='${{ secrets.OLLAMA_API_KEY }}'" >> .env

      - name: Create Blogs
        shell: bash
        run: |
          source .venv/bin/activate
          python src/main.py
|
||||||
|
|||||||
312
README.md
312
README.md
@ -1,64 +1,290 @@
|
|||||||
## BLOG CREATOR
|
# Blog Creator
|
||||||
|
|
||||||
This creator requires you to use a working Trilium Instance and create a .env file with the following
|
An automated blog generation system that uses CrewAI agents to research, write, and edit blog posts from Trilium notes.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
The system uses three CrewAI crews orchestrated by a Flow:
|
||||||
|
|
||||||
|
1. **Research Crew** - A critical researcher agent with web search capabilities investigates the topic and produces verified findings
|
||||||
|
2. **Writing Crew** - Four creative journalist agents write draft blog articles in parallel, each with different creative styles
|
||||||
|
3. **Editor Crew** - A critical editor loads the drafts into a vector database, queries for relevant context, and produces the final polished document with metadata
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.10 or later
|
||||||
|
- Ollama server running with required models
|
||||||
|
- ChromaDB server for vector storage
|
||||||
|
- Trilium notes instance
|
||||||
|
- Gitea instance (for automated workflows)
|
||||||
|
- n8n instance (for notifications)
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
Create a `.env` file in the project root with the following variables:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
# Trilium Configuration
|
||||||
TRILIUM_HOST=
|
TRILIUM_HOST=
|
||||||
TRILIUM_PORT=
|
TRILIUM_PORT=
|
||||||
TRILIUM_PROTOCOL=
|
TRILIUM_PROTOCOL=https
|
||||||
TRILIUM_PASS=
|
TRILIUM_PASS=
|
||||||
TRILIUM_TOKEN=
|
TRILIUM_TOKEN=
|
||||||
OLLAMA_PROTOCOL=
|
|
||||||
|
# Ollama Configuration
|
||||||
|
OLLAMA_PROTOCOL=http
|
||||||
OLLAMA_HOST=
|
OLLAMA_HOST=
|
||||||
OLLAMA_PORT=11434
|
OLLAMA_PORT=11434
|
||||||
EMBEDDING_MODEL=
|
EMBEDDING_MODEL=nomic-embed-text
|
||||||
EDITOR_MODEL=
|
EDITOR_MODEL=llama3.1:8b
|
||||||
# This is expected in python list format example `[phi4-mini:latest, qwen3:1.7b, gemma3:latest]`
|
CONTENT_CREATOR_MODELS=["phi4-mini:latest", "qwen3:1.7b", "gemma3:latest"]
|
||||||
CONTENT_CREATOR_MODELS=
|
|
||||||
CHROMA_SERVER=<IP_ADDRESS>
|
# ChromaDB Configuration
|
||||||
|
CHROMA_HOST=chroma
|
||||||
|
CHROMA_PORT=8000
|
||||||
|
|
||||||
|
# Git Configuration
|
||||||
|
GIT_USER=
|
||||||
|
GIT_PASS=
|
||||||
|
GIT_PROTOCOL=https
|
||||||
|
GIT_REMOTE=git.aridgwayweb.com/armistace/blog.git
|
||||||
|
|
||||||
|
# Notification Configuration
|
||||||
|
N8N_SECRET=
|
||||||
|
N8N_WEBHOOK_URL=
|
||||||
|
|
||||||
|
# Ollama Web Search (required for researcher agent)
|
||||||
|
OLLAMA_API_KEY=
|
||||||
```
|
```
|
||||||
|
|
||||||
This container is going to be what I use to trigger a blog creation event
|
### CONTENT_CREATOR_MODELS Format
|
||||||
|
|
||||||
To do this we will
|
The `CONTENT_CREATOR_MODELS` variable should be a JSON array of Ollama model names. Each model will be used by one of the four journalist agents. Example:
|
||||||
|
|
||||||
1. Download a Note from Trilium (I need to work out how to choose this, maybe something with a tag and then this can add a tag when it's used? each note is a separate post, a tag to indicate if it's ready as well?)
|
|
||||||
|
|
||||||
`SELECT NOTES WHERE blog_tag = true AND used_tag = false AND ready_tag = true?`
|
|
||||||
|
|
||||||
2. Check if the ollama server is available (it's currently on a box that may not be on)
|
|
||||||
|
|
||||||
- If not on stop
|
|
||||||
|
|
||||||
3. `git pull git.aridgwayweb.com/blog`
|
|
||||||
|
|
||||||
- set up git creds: git.name = ai git.email = ridgwayinfrastructure@gmail.com get git password stored (create service user in gitea for this)
|
|
||||||
|
|
||||||
- `git config set upstream Auto true`
|
|
||||||
|
|
||||||
4. cd /src/content
|
|
||||||
|
|
||||||
5. take the information from the trillium note and prepare a 500 word blog post, insert the following at the top
|
|
||||||
|
|
||||||
```
|
```
|
||||||
Title: <title>
|
CONTENT_CREATOR_MODELS=["llama3.1:8b", "qwen2.5:7b", "phi4:latest"]
|
||||||
Date: <date post created>
|
|
||||||
Modified: <date post created>
|
|
||||||
Category: <this will come from a tag on the post (category: <category>)
|
|
||||||
Tags: <ai generated tags>, ai_content, not_human_content
|
|
||||||
Slug: <have ai write slug?>
|
|
||||||
Authors: <model name>.ai
|
|
||||||
Summary: <have ai write a 10 word summary of the post>
|
|
||||||
```
|
```
|
||||||
|
|
||||||
6. write it to `<title>.md`
|
### OLLAMA_API_KEY
|
||||||
|
|
||||||
7. `git checkout -b <title>`
|
The researcher agent uses Ollama's native web search API. Create an API key from your Ollama account (https://ollama.com) and add it to your `.env` file. This uses your existing Ollama subscription for web searches.
|
||||||
|
|
||||||
8. `git add .`
|
## Project Structure
|
||||||
|
|
||||||
9. `git commit -m "<have ai write a git commit about the post>"`
|
```
|
||||||
|
blog_creator/
|
||||||
|
├── .env # Environment variables (create this)
|
||||||
|
├── .gitea/workflows/deploy.yml # Gitea Actions workflow
|
||||||
|
├── docker-compose.yml # Local development setup
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── README.md # This file
|
||||||
|
└── src/
|
||||||
|
├── main.py # Entry point
|
||||||
|
└── ai_generators/
|
||||||
|
├── ollama_md_generator.py # Main interface (used by main.py)
|
||||||
|
├── blog_flow.py # CrewAI Flow orchestrator
|
||||||
|
├── crews/
|
||||||
|
│ ├── research_crew/ # Researcher agent with web search
|
||||||
|
│   ├── writing_crew/            # Four journalist agents
|
||||||
|
│ └── editor_crew/ # Editor agent with metadata generation
|
||||||
|
└── tools/
|
||||||
|
```
|
||||||
|
|
||||||
10. `git push`
|
## Local Development Setup
|
||||||
|
|
||||||
11. Send notification via n8n to matrix for me to review?
|
### Using Docker Compose
|
||||||
|
|
||||||
|
1. Clone the repository and navigate to the project directory
|
||||||
|
|
||||||
|
2. Create your `.env` file with all required variables
|
||||||
|
|
||||||
|
3. Start the services:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
This starts:
|
||||||
|
- `blog_creator` - The main application container
|
||||||
|
- `chroma` - ChromaDB vector database
|
||||||
|
|
||||||
|
4. The container will run `main.py` automatically on startup. To run manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose exec blog_creator python src/main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Manual Setup (without Docker)
|
||||||
|
|
||||||
|
1. Install system dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
apt update && apt install -y rustc cargo python-is-python3 pip python3-venv libmagic-dev git
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Create and activate a virtual environment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Install Python dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Configure Git:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git config --global user.name "Blog Creator"
|
||||||
|
git config --global user.email "your-email@example.com"
|
||||||
|
git config --global push.autoSetupRemote true
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Run the application:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python src/main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Trilium Integration
|
||||||
|
|
||||||
|
The system fetches notes from Trilium that are tagged for blog creation. Each note becomes one blog post. The note content is used as the basis for the AI-generated article.
|
||||||
|
|
||||||
|
### Blog Generation Flow
|
||||||
|
|
||||||
|
1. **Research Phase** - The researcher agent investigates the topic using web search, critically evaluates claims, and produces verified findings
|
||||||
|
|
||||||
|
2. **Writing Phase** - Four journalist agents write creative drafts in parallel, each with different temperature and top_p settings for variety
|
||||||
|
|
||||||
|
3. **Editor Phase** - The editor:
|
||||||
|
- Chunks and embeds all drafts into ChromaDB
|
||||||
|
- Queries the vector database for relevant context
|
||||||
|
- Generates the final polished document with metadata header
|
||||||
|
|
||||||
|
### Output Format
|
||||||
|
|
||||||
|
Each blog post includes a metadata header followed by the markdown body:
|
||||||
|
|
||||||
|
```
|
||||||
|
Title: Designing and Building an AI Enhanced CCTV System
|
||||||
|
Date: 2026-02-02 20:00
|
||||||
|
Modified: 2026-02-02 20:00
|
||||||
|
Category: Homelab
|
||||||
|
Tags: proxmox, hardware, self host, homelab, ai_content, not_human_content
|
||||||
|
Slug: ai-enhanced-cctv
|
||||||
|
Authors: phi4-mini.ai, qwen3.ai, gemma3.ai
|
||||||
|
Summary: Home CCTV Security has become a bastion of cloud subscription awfulness. This blog describes creating your own AI enhanced system.
|
||||||
|
|
||||||
|
<full markdown blog body follows>
|
||||||
|
```
|
||||||
|
|
||||||
|
The metadata fields are generated as follows:
|
||||||
|
- **Title** - From the Trilium note title
|
||||||
|
- **Date/Modified** - Current datetime when generated
|
||||||
|
- **Category** - AI-generated single word (e.g., Homelab, DevOps, Security)
|
||||||
|
- **Tags** - AI-generated relevant tags plus `ai_content, not_human_content`
|
||||||
|
- **Slug** - AI-generated URL-friendly slug
|
||||||
|
- **Authors** - Derived from CONTENT_CREATOR_MODELS (model name + `.ai`)
|
||||||
|
- **Summary** - AI-generated 15-25 word summary
|
||||||
|
|
||||||
|
### Git Workflow
|
||||||
|
|
||||||
|
After generation, the blog post is:
|
||||||
|
1. Committed to a new branch named after the slug
|
||||||
|
2. Pushed to the configured Git remote
|
||||||
|
3. A notification is sent via n8n to Matrix for review
|
||||||
|
|
||||||
|
## Gitea Actions Workflow
|
||||||
|
|
||||||
|
The `.gitea/workflows/deploy.yml` file defines an automated workflow that:
|
||||||
|
|
||||||
|
- Runs on a schedule (daily at 18:15 UTC) or on push to master branch
|
||||||
|
- Installs all dependencies
|
||||||
|
- Creates the `.env` file from Gitea secrets and variables
|
||||||
|
- Runs the blog generation script
|
||||||
|
|
||||||
|
### Setting Up Gitea Variables
|
||||||
|
|
||||||
|
In your Gitea repository settings, configure the following:
|
||||||
|
|
||||||
|
**Variables** (Repository Settings -> Variables):
|
||||||
|
- `TRILIUM_HOST` - Your Trilium server hostname
|
||||||
|
- `TRILIUM_PORT` - Trilium port
|
||||||
|
- `TRILIUM_PROTOCOL` - http or https
|
||||||
|
- `OLLAMA_PROTOCOL` - http or https
|
||||||
|
- `OLLAMA_HOST` - Ollama server hostname
|
||||||
|
- `OLLAMA_PORT` - Ollama port (default 11434)
|
||||||
|
- `EMBEDDING_MODEL` - Embedding model name
|
||||||
|
- `EDITOR_MODEL` - Editor/Researcher model name
|
||||||
|
- `CONTENT_CREATOR_MODELS_1` through `CONTENT_CREATOR_MODELS_4` - Individual model names (the workflow joins these into an array)
|
||||||
|
- `GIT_PROTOCOL` - https or ssh
|
||||||
|
- `GIT_REMOTE` - Git repository URL
|
||||||
|
- `GIT_USER` - Git username for pushing
|
||||||
|
- `N8N_WEBHOOK_URL` - n8n webhook URL for notifications
|
||||||
|
- `CHROMA_HOST` - ChromaDB hostname
|
||||||
|
- `CHROMA_PORT` - ChromaDB port
|
||||||
|
|
||||||
|
**Secrets** (Repository Settings -> Secrets):
|
||||||
|
- `TRILIUM_PASS` - Trilium password
|
||||||
|
- `TRILIUM_TOKEN` - Trilium API token
|
||||||
|
- `GIT_PASS` - Git password or personal access token
|
||||||
|
- `N8N_SECRET` - n8n webhook secret key
|
||||||
|
- `OLLAMA_API_KEY` - Ollama API key for web search
|
||||||
|
|
||||||
|
### Workflow Triggers
|
||||||
|
|
||||||
|
The workflow runs automatically when:
|
||||||
|
- A push is made to the master branch
|
||||||
|
- The scheduled cron time is reached (18:15 UTC daily)
|
||||||
|
|
||||||
|
To trigger manually, push any change to master or modify the cron schedule in `.gitea/workflows/deploy.yml`.
|
||||||
|
|
||||||
|
## Customizing Agent Behavior
|
||||||
|
|
||||||
|
Agent personalities and task instructions are defined in YAML files under `src/ai_generators/crews/*/config/`. You can modify these without changing Python code:
|
||||||
|
|
||||||
|
- `research_crew/config/agents.yaml` - Researcher role, goal, backstory
|
||||||
|
- `research_crew/config/tasks.yaml` - Research task description
|
||||||
|
- `writing_crew/config/agents.yaml` - Four journalist personalities
|
||||||
|
- `writing_crew/config/tasks.yaml` - Writing task descriptions
|
||||||
|
- `editor_crew/config/agents.yaml` - Editor role, goal, backstory
|
||||||
|
- `editor_crew/config/tasks.yaml` - Editing task and metadata format
|
||||||
|
|
||||||
|
After editing YAML files, restart the application or container to apply changes.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Ollama Connection Errors
|
||||||
|
|
||||||
|
Ensure the Ollama server is running and accessible from the blog_creator container. Check `OLLAMA_HOST` and `OLLAMA_PORT` in your `.env` file.
|
||||||
|
|
||||||
|
### ChromaDB Connection Errors
|
||||||
|
|
||||||
|
Verify ChromaDB is running and the `CHROMA_HOST` and `CHROMA_PORT` variables are correct. In Docker Compose, use `chroma` as the host name.
|
||||||
|
|
||||||
|
### Ollama Web Search Errors
|
||||||
|
|
||||||
|
If the researcher agent fails with web search errors, check that `OLLAMA_API_KEY` is set correctly. Verify your Ollama subscription is active and has web search access.
|
||||||
|
|
||||||
|
### Empty Output
|
||||||
|
|
||||||
|
If blog posts are generated but empty, check:
|
||||||
|
- Ollama models are downloaded and available
|
||||||
|
- `CONTENT_CREATOR_MODELS` contains valid model names
|
||||||
|
- Sufficient timeout for model inference (default is 30 minutes per operation)
|
||||||
|
|
||||||
|
### Git Push Failures
|
||||||
|
|
||||||
|
Verify `GIT_USER` and `GIT_PASS` are correct and the user has write access to the remote repository. Check that the remote URL in `GIT_REMOTE` is accessible.
|
||||||
|
|
||||||
|
## Development Notes
|
||||||
|
|
||||||
|
- The `main.py` entry point should not be modified for normal operation
|
||||||
|
- All AI generation logic is in `src/ai_generators/`
|
||||||
|
- The Flow pattern allows easy addition of new crews or steps
|
||||||
|
- Vector database collections are named `blog_{title}_{random_id}` and persist across runs
|
||||||
@ -3,6 +3,7 @@ trilium-py
|
|||||||
gitpython
|
gitpython
|
||||||
PyGithub
|
PyGithub
|
||||||
chromadb
|
chromadb
|
||||||
langchain-ollama
|
crewai
|
||||||
|
crewai-tools
|
||||||
PyJWT
|
PyJWT
|
||||||
dotenv
|
dotenv
|
||||||
|
|||||||
318
src/ai_generators/blog_flow.py
Normal file
318
src/ai_generators/blog_flow.py
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
"""
|
||||||
|
CrewAI Flow that orchestrates the blog-generation pipeline.
|
||||||
|
|
||||||
|
Flow
|
||||||
|
----
|
||||||
|
1. **Research crew** – a critical researcher with web-search investigates the
|
||||||
|
topic and produces verified findings.
|
||||||
|
2. **Writing crew** – four creative journalists write draft blog articles
|
||||||
|
in parallel (async tasks).
|
||||||
|
3. **Editor crew** – a critical editor loads the journalist drafts into
|
||||||
|
ChromaDB, queries for the most relevant context, and produces the final
|
||||||
|
polished markdown document complete with a metadata header (Title, Date,
|
||||||
|
Category, Tags, Slug, Authors, Summary).
|
||||||
|
|
||||||
|
The ChromaDB integration is preserved from the original implementation: each
|
||||||
|
journalist draft is chunked, embedded, and stored in a collection; the editor
|
||||||
|
receives the top-N most relevant chunks as context.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import string
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import chromadb
|
||||||
|
from crewai.flow.flow import Flow, listen, start
|
||||||
|
from ollama import Client
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
|
from ai_generators.crews.editor_crew.editor_crew import EditorCrew
|
||||||
|
from ai_generators.crews.research_crew.research_crew import ResearchCrew
|
||||||
|
from ai_generators.crews.writing_crew.writing_crew import WritingCrew
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# State
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class BlogFlowState(BaseModel):
    """Structured state for the blog generation flow.

    Populated progressively by the BlogFlow steps: research fills
    ``research_findings``, the writing step fills ``drafts``, and the
    editor step fills the metadata fields and ``final_document``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Slug-style title (used for filenames / vector-db collection names).
    title: str = ""
    # Human-readable blog title shown in the metadata header.
    inner_title: str = ""
    # Original Trilium note content the article is based on.
    content: str = ""
    # Output of the research crew.
    research_findings: str = ""
    # One raw draft per journalist agent.
    drafts: list[str] = []
    # Final polished markdown document from the editor crew.
    final_document: str = ""
    # Metadata-header fields (Date, Authors, Category).
    date: str = ""
    authors: str = ""
    category: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Flow
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class BlogFlow(Flow[BlogFlowState]):
|
||||||
|
"""Orchestrate researcher → journalists → editor via CrewAI Flows.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
flow = BlogFlow()
|
||||||
|
result = flow.kickoff(inputs={
|
||||||
|
"title": "my_blog_slug",
|
||||||
|
"inner_title": "My Blog Title",
|
||||||
|
"content": "<original content>",
|
||||||
|
})
|
||||||
|
print(result) # final markdown document
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Helpers – Ollama / ChromaDB / embedding utilities
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_ollama_url() -> str:
|
||||||
|
return (
|
||||||
|
f"{os.environ['OLLAMA_PROTOCOL']}://"
|
||||||
|
f"{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
    @staticmethod
    def _get_chroma_client() -> chromadb.HttpClient:
        """Build an HTTP client for the ChromaDB server configured via
        CHROMA_HOST / CHROMA_PORT environment variables."""
        # Port arrives as a string from the environment; chromadb wants int.
        chroma_port = int(os.environ["CHROMA_PORT"])
        return chromadb.HttpClient(host=os.environ["CHROMA_HOST"], port=chroma_port)
|
||||||
|
|
||||||
|
    @staticmethod
    def _get_ollama_client() -> Client:
        """Return an Ollama client pointed at the configured server URL."""
        return Client(host=BlogFlow._get_ollama_url())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _id_generator(size: int = 6) -> str:
|
||||||
|
return "".join(
|
||||||
|
random.choice(string.ascii_uppercase + string.digits) for _ in range(size)
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _split_into_chunks(text: str, chunk_size: int = 100) -> list[str]:
|
||||||
|
words = re.findall(r"\S+", text)
|
||||||
|
chunks: list[str] = []
|
||||||
|
current_chunk: list[str] = []
|
||||||
|
word_count = 0
|
||||||
|
for word in words:
|
||||||
|
current_chunk.append(word)
|
||||||
|
word_count += 1
|
||||||
|
if word_count >= chunk_size:
|
||||||
|
chunks.append(" ".join(current_chunk))
|
||||||
|
current_chunk = []
|
||||||
|
word_count = 0
|
||||||
|
if current_chunk:
|
||||||
|
chunks.append(" ".join(current_chunk))
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_embeddings(chunks: list[str]) -> list[list[float]]:
|
||||||
|
ollama_client = BlogFlow._get_ollama_client()
|
||||||
|
embed_model = os.environ["EMBEDDING_MODEL"]
|
||||||
|
try:
|
||||||
|
embeds = ollama_client.embed(model=embed_model, input=chunks)
|
||||||
|
return embeds.get("embeddings", []) # type: ignore[no-any-return]
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"Error generating embeddings: {exc}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
    def _load_drafts_to_vector_db(self, drafts: list[str]) -> chromadb.Collection:
        """Load journalist drafts into a new ChromaDB collection and return it.

        Each draft is split into word chunks, embedded via Ollama, and added
        to a freshly named collection. Drafts with no content or failed
        embeddings are skipped rather than aborting the run.
        """
        chroma = self._get_chroma_client()
        # Random suffix keeps collections unique across runs for the same title.
        collection_name = (
            f"blog_{self.state.title.lower().replace(' ', '_')}_{self._id_generator()}"
        )
        collection = chroma.get_or_create_collection(name=collection_name)

        for i, draft in enumerate(drafts):
            # Synthetic per-draft label used for ids and metadata.
            model_name = f"journalist_{i + 1}"
            chunks = self._split_into_chunks(draft)
            if not chunks or all(chunk.strip() == "" for chunk in chunks):
                print(f"Skipping {model_name} – no content generated")
                continue
            print(f"Generating embeddings for {model_name}")
            embeds = self._get_embeddings(chunks)
            if not embeds:
                print(f"Skipping {model_name} – no embeddings generated")
                continue
            # Defensive: if the embedder returned a different count, truncate
            # both lists to the common prefix so add() gets matched pairs.
            if len(embeds) != len(chunks):
                min_length = min(len(embeds), len(chunks))
                chunks = chunks[:min_length]
                embeds = embeds[:min_length]
                if min_length == 0:
                    print(f"Skipping {model_name} – no valid content/embeddings pairs")
                    continue
            ids = [model_name + str(j) for j in range(len(chunks))]
            metadata = [{"model_agent": model_name} for _ in chunks]
            print(f"Loading into collection for {model_name}")
            collection.add(
                documents=chunks,
                embeddings=embeds,  # type: ignore[arg-type]
                ids=ids,
                metadatas=metadata,  # type: ignore[arg-type]
            )
        return collection
|
||||||
|
|
||||||
|
    @staticmethod
    def _query_vector_db(collection: chromadb.Collection, query_text: str) -> str:
        """Query the ChromaDB collection and return the most relevant
        document chunks joined as a single string.

        Falls back to a placeholder string (rather than raising) when the
        query embedding or the collection query fails, so the editor crew
        always receives some context string.
        """
        ollama_client = BlogFlow._get_ollama_client()
        embed_model = os.environ["EMBEDDING_MODEL"]
        # Embed the query text; on failure substitute an empty embedding so
        # the query below still executes (and its own except handles errors).
        try:
            embed_result = ollama_client.embed(model=embed_model, input=query_text)
            query_embed = embed_result.get("embeddings", [])
            if not query_embed:
                print(
                    "Warning: Failed to generate query embeddings, "
                    "falling back to empty list"
                )
                query_embed = [[]]
        except Exception as exc:
            print(f"Error generating query embeddings: {exc}")
            query_embed = [[]]

        try:
            # Retrieve up to 100 of the most similar chunks.
            query_result = collection.query(
                query_embeddings=query_embed,
                n_results=100,  # type: ignore[arg-type]
            )
            documents = query_result.get("documents", [])
            # `documents` is a list of result lists (one per query embedding);
            # we issued a single query, so join documents[0].
            if documents and len(documents) > 0 and len(documents[0]) > 0:
                return "\n\n".join(documents[0])
            print("Warning: No relevant documents found in collection")
            return "No relevant information found in drafts."
        except Exception as exc:
            print(f"Error querying collection: {exc}")
            return "No relevant information found in drafts due to query error."
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Flow steps
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@start()
|
||||||
|
def research(self) -> str:
|
||||||
|
"""Run the research crew to investigate the blog topic."""
|
||||||
|
print("=" * 60)
|
||||||
|
print("RESEARCH PHASE – investigating topic")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
result = (
|
||||||
|
ResearchCrew()
|
||||||
|
.crew()
|
||||||
|
.kickoff(
|
||||||
|
inputs={
|
||||||
|
"inner_title": self.state.inner_title,
|
||||||
|
"content": self.state.content,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.state.research_findings = result.raw
|
||||||
|
print("Research phase complete")
|
||||||
|
return result.raw
|
||||||
|
|
||||||
|
@listen(research)
|
||||||
|
def write_drafts(self, research_findings: str) -> list[str]:
|
||||||
|
"""Run the writing crew (4 journalists in parallel) and collect
|
||||||
|
their draft outputs."""
|
||||||
|
print("=" * 60)
|
||||||
|
print("WRITING PHASE – 4 journalists drafting in parallel")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
result = (
|
||||||
|
WritingCrew()
|
||||||
|
.crew()
|
||||||
|
.kickoff(
|
||||||
|
inputs={
|
||||||
|
"inner_title": self.state.inner_title,
|
||||||
|
"content": self.state.content,
|
||||||
|
"research_findings": research_findings,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Collect all draft outputs from the crew's task outputs
|
||||||
|
drafts: list[str] = []
|
||||||
|
for task_output in result.tasks_output:
|
||||||
|
drafts.append(task_output.raw)
|
||||||
|
|
||||||
|
self.state.drafts = drafts
|
||||||
|
print(f"Writing phase complete – {len(drafts)} drafts produced")
|
||||||
|
return drafts
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _compute_authors() -> str:
|
||||||
|
"""Build an author string from the CONTENT_CREATOR_MODELS env var.
|
||||||
|
|
||||||
|
Each model name is stripped of any tag suffix (e.g. ``:latest``)
|
||||||
|
and ``.ai`` is appended. Multiple models are joined with ``', '``.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
models = json.loads(os.environ["CONTENT_CREATOR_MODELS"])
|
||||||
|
except (KeyError, json.JSONDecodeError):
|
||||||
|
models = []
|
||||||
|
authors = ", ".join(m.split(":")[0].split("/")[-1] + ".ai" for m in models)
|
||||||
|
return authors or "unknown.ai"
|
||||||
|
|
||||||
|
@listen(write_drafts)
|
||||||
|
def edit_final(self, drafts: list[str]) -> str:
|
||||||
|
"""Load journalist drafts into the vector DB, query for the most
|
||||||
|
relevant context, and run the editor crew to produce the final
|
||||||
|
polished document with a metadata header."""
|
||||||
|
print("=" * 60)
|
||||||
|
print("EDITOR PHASE – producing final document")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# ---- Compute date and authors for the metadata header ----
|
||||||
|
if not self.state.date:
|
||||||
|
self.state.date = datetime.now().strftime("%Y-%m-%d %H:%M")
|
||||||
|
self.state.authors = self._compute_authors()
|
||||||
|
if not self.state.category:
|
||||||
|
self.state.category = "<pick one word that best describes the topic, e.g. Homelab, DevOps, Security, Networking>"
|
||||||
|
|
||||||
|
# ---- Vector DB integration ----
|
||||||
|
print("Loading drafts into vector database")
|
||||||
|
collection = self._load_drafts_to_vector_db(drafts)
|
||||||
|
|
||||||
|
# Build the editor's brief so we can query the vector DB with it
|
||||||
|
editor_brief = (
|
||||||
|
f"You are an editor taking information from 3 Software "
|
||||||
|
f"Developers and Data experts writing a 5000 word blog article. "
|
||||||
|
f"You like when they use almost no code examples. "
|
||||||
|
f"You are also Australian. The title for the blog is "
|
||||||
|
f"{self.state.inner_title}. "
|
||||||
|
f"The basis for the content of the blog is: "
|
||||||
|
f"<blog>{self.state.content}</blog>"
|
||||||
|
)
|
||||||
|
draft_context = self._query_vector_db(collection, editor_brief)
|
||||||
|
print("Showing pertinent info from drafts used in final edited edition")
|
||||||
|
|
||||||
|
# ---- Editor crew ----
|
||||||
|
result = (
|
||||||
|
EditorCrew()
|
||||||
|
.crew()
|
||||||
|
.kickoff(
|
||||||
|
inputs={
|
||||||
|
"inner_title": self.state.inner_title,
|
||||||
|
"content": self.state.content,
|
||||||
|
"draft_context": draft_context,
|
||||||
|
"date": self.state.date,
|
||||||
|
"authors": self.state.authors,
|
||||||
|
"category": self.state.category,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.state.final_document = result.raw
|
||||||
|
print("Editor phase complete")
|
||||||
|
return result.raw
|
||||||
0
src/ai_generators/crews/__init__.py
Normal file
0
src/ai_generators/crews/__init__.py
Normal file
0
src/ai_generators/crews/editor_crew/__init__.py
Normal file
0
src/ai_generators/crews/editor_crew/__init__.py
Normal file
20
src/ai_generators/crews/editor_crew/config/agents.yaml
Normal file
20
src/ai_generators/crews/editor_crew/config/agents.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
editor:
|
||||||
|
role: >
|
||||||
|
Critical Blog Editor
|
||||||
|
goal: >
|
||||||
|
Produce the final, polished ~5000-word version of a blog about {inner_title},
|
||||||
|
complete with a metadata header (Title, Date, Category, Tags, Slug, Authors,
|
||||||
|
Summary)
|
||||||
|
backstory: >
|
||||||
|
You are an editor taking information from 3 Software Developers and
|
||||||
|
Data experts writing a 5000 word blog article. You like when they use
|
||||||
|
almost no code examples. You are also Australian. The content may have
|
||||||
|
light comedic elements; you are more professional and will attempt to
|
||||||
|
tone these down. You are critical of repeated sentences, inconsistencies,
|
||||||
|
and weak arguments. You ensure the final document is cohesive,
|
||||||
|
well-structured, and publication-ready. You never leave placeholder
|
||||||
|
text — every section must contain finished content. You always begin
|
||||||
|
your output with a plain-text metadata block (Title, Date, Modified,
|
||||||
|
Category, Tags, Slug, Authors, Summary) followed by a blank line and
|
||||||
|
then the full markdown body. You generate sensible Category, Tags,
|
||||||
|
Slug and Summary values based on the blog content.
|
||||||
45
src/ai_generators/crews/editor_crew/config/tasks.yaml
Normal file
45
src/ai_generators/crews/editor_crew/config/tasks.yaml
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
edit_task:
|
||||||
|
description: >
|
||||||
|
Generate the final, 5000 word blog post using this information
|
||||||
|
from the journalist drafts:
|
||||||
|
<context>{draft_context}</context>
|
||||||
|
|
||||||
|
You are an editor taking information from 3 Software Developers and
|
||||||
|
Data experts writing a 5000 word blog article. You like when they use
|
||||||
|
almost no code examples. You are also Australian. The content may have
|
||||||
|
light comedic elements; you are more professional and will attempt to
|
||||||
|
tone these down. As this person produce the final version of this blog
|
||||||
|
as a markdown document keeping in mind the context provided by the
|
||||||
|
previous drafts. You are to produce the content not placeholders for
|
||||||
|
further editors. The title for the blog is {inner_title}. Avoid
|
||||||
|
repeated sentences. The basis for the content of the blog is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
IMPORTANT: The output MUST start with a metadata block in exactly this
|
||||||
|
format, followed by a blank line, then the blog body. Do not wrap the
|
||||||
|
metadata block in code fences or any other markup. Generate sensible
|
||||||
|
values for Category, Tags, Slug and Summary based on the blog content.
|
||||||
|
|
||||||
|
Title: {inner_title}
|
||||||
|
Date: {date}
|
||||||
|
Modified: {date}
|
||||||
|
Category: {category}
|
||||||
|
Tags: <generate 3-5 short lowercase tags relevant to the content>, ai_content, not_human_content
|
||||||
|
Slug: <generate a short URL-friendly slug using lowercase words separated by hyphens>
|
||||||
|
Authors: {authors}
|
||||||
|
Summary: <write a single sentence summary of roughly 15-25 words>
|
||||||
|
|
||||||
|
After the metadata block and blank line, write the full blog body in
|
||||||
|
markdown. Do not repeat the title as a heading in the body.
|
||||||
|
|
||||||
|
- Only output the metadata block and then the markdown body.
|
||||||
|
- Do not wrap in markdown code fences.
|
||||||
|
- Do not provide a commentary on the drafts in the context.
|
||||||
|
- Produce real content, not placeholders for further editors.
|
||||||
|
- Avoid repeated sentences.
|
||||||
|
expected_output: >
|
||||||
|
A metadata block (Title, Date, Modified, Category, Tags, Slug, Authors,
|
||||||
|
Summary) followed by a blank line and then a polished ~5000-word markdown
|
||||||
|
blog article about {inner_title}. No commentary. No placeholders. Cohesive
|
||||||
|
and publication-ready.
|
||||||
|
agent: editor
|
||||||
51
src/ai_generators/crews/editor_crew/editor_crew.py
Normal file
51
src/ai_generators/crews/editor_crew/editor_crew.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
"""Editor crew – produces the final polished blog document."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from crewai import LLM, Agent, Crew, Process, Task
|
||||||
|
from crewai.project import CrewBase, agent, crew, task
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ollama_url() -> str:
|
||||||
|
return (
|
||||||
|
f"{os.environ['OLLAMA_PROTOCOL']}://"
|
||||||
|
f"{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@CrewBase
|
||||||
|
class EditorCrew:
|
||||||
|
"""Crew with a single critical editor who produces the final blog."""
|
||||||
|
|
||||||
|
agents_config = "config/agents.yaml"
|
||||||
|
tasks_config = "config/tasks.yaml"
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def editor(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["editor"], # type: ignore[index]
|
||||||
|
llm=LLM(
|
||||||
|
model=f"ollama/{os.environ['EDITOR_MODEL']}",
|
||||||
|
base_url=_get_ollama_url(),
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.5,
|
||||||
|
),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=30,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
@task
|
||||||
|
def edit_task(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["edit_task"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
@crew
|
||||||
|
def crew(self) -> Crew:
|
||||||
|
return Crew(
|
||||||
|
agents=self.agents,
|
||||||
|
tasks=self.tasks,
|
||||||
|
process=Process.sequential,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
0
src/ai_generators/crews/research_crew/__init__.py
Normal file
0
src/ai_generators/crews/research_crew/__init__.py
Normal file
15
src/ai_generators/crews/research_crew/config/agents.yaml
Normal file
15
src/ai_generators/crews/research_crew/config/agents.yaml
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
researcher:
|
||||||
|
role: >
|
||||||
|
Critical Technology Researcher
|
||||||
|
goal: >
|
||||||
|
Research and critically evaluate information related to {inner_title}
|
||||||
|
backstory: >
|
||||||
|
You are a skeptical, thorough technology researcher with years of
|
||||||
|
experience in Software Development and DevOps. You never accept
|
||||||
|
information at face value and always cross-reference claims with
|
||||||
|
multiple sources. You are particularly critical of hype, marketing
|
||||||
|
language, and unsubstantiated technical claims. You prefer primary
|
||||||
|
sources, official documentation, and peer-reviewed material over
|
||||||
|
blog posts and opinion pieces. When conflicting information is found
|
||||||
|
you clearly note the discrepancy and provide both viewpoints with
|
||||||
|
credibility assessments.
|
||||||
23
src/ai_generators/crews/research_crew/config/tasks.yaml
Normal file
23
src/ai_generators/crews/research_crew/config/tasks.yaml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
research_task:
|
||||||
|
description: >
|
||||||
|
Research the topic: {inner_title}
|
||||||
|
|
||||||
|
The original content to research and expand upon is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
Your task is to:
|
||||||
|
1. Search the web for current, accurate information related to this topic.
|
||||||
|
2. Critically evaluate the claims made in the original content.
|
||||||
|
3. Find supporting or contradicting evidence from reputable sources.
|
||||||
|
4. Identify any outdated information, common misconceptions, or factual errors.
|
||||||
|
5. Provide a comprehensive research summary with verified facts, clearly
|
||||||
|
distinguishing between confirmed information and areas of uncertainty.
|
||||||
|
|
||||||
|
Be thorough and skeptical. Only include information you can verify from
|
||||||
|
reliable sources. Flag anything that seems exaggerated or unverified.
|
||||||
|
expected_output: >
|
||||||
|
A comprehensive research report with verified facts, source citations,
|
||||||
|
and credibility assessments. Clearly distinguish between confirmed
|
||||||
|
information and areas of uncertainty. Include supporting and
|
||||||
|
contradicting evidence where found.
|
||||||
|
agent: researcher
|
||||||
54
src/ai_generators/crews/research_crew/research_crew.py
Normal file
54
src/ai_generators/crews/research_crew/research_crew.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
"""Research crew – investigates a blog topic using web search."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from crewai import LLM, Agent, Crew, Process, Task
|
||||||
|
from crewai.project import CrewBase, agent, crew, task
|
||||||
|
|
||||||
|
from ai_generators.tools import OllamaWebSearchTool
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ollama_url() -> str:
|
||||||
|
return (
|
||||||
|
f"{os.environ['OLLAMA_PROTOCOL']}://"
|
||||||
|
f"{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@CrewBase
|
||||||
|
class ResearchCrew:
|
||||||
|
"""Crew that researches a blog topic with a critical, web-searching
|
||||||
|
researcher agent."""
|
||||||
|
|
||||||
|
agents_config = "config/agents.yaml"
|
||||||
|
tasks_config = "config/tasks.yaml"
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def researcher(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["researcher"], # type: ignore[index]
|
||||||
|
tools=[OllamaWebSearchTool()],
|
||||||
|
llm=LLM(
|
||||||
|
model=f"ollama/{os.environ['EDITOR_MODEL']}",
|
||||||
|
base_url=_get_ollama_url(),
|
||||||
|
temperature=0.3,
|
||||||
|
),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=25,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
@task
|
||||||
|
def research_task(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["research_task"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
@crew
|
||||||
|
def crew(self) -> Crew:
|
||||||
|
return Crew(
|
||||||
|
agents=self.agents,
|
||||||
|
tasks=self.tasks,
|
||||||
|
process=Process.sequential,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
0
src/ai_generators/crews/writing_crew/__init__.py
Normal file
0
src/ai_generators/crews/writing_crew/__init__.py
Normal file
48
src/ai_generators/crews/writing_crew/config/agents.yaml
Normal file
48
src/ai_generators/crews/writing_crew/config/agents.yaml
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
journalist_one:
|
||||||
|
role: >
|
||||||
|
Creative Technology Journalist
|
||||||
|
goal: >
|
||||||
|
Write a creative, engaging ~5000-word draft blog article about {inner_title}
|
||||||
|
backstory: >
|
||||||
|
You are a journalist, Software Developer and DevOps expert writing a
|
||||||
|
draft blog article for other tech enthusiasts. You like to use almost no
|
||||||
|
code examples and prefer to talk in a light comedic tone. You are also
|
||||||
|
Australian. You favour vivid analogies and storytelling to explain
|
||||||
|
technical concepts. Your writing is warm, slightly irreverent, and
|
||||||
|
accessible.
|
||||||
|
|
||||||
|
journalist_two:
|
||||||
|
role: >
|
||||||
|
Creative Technology Journalist
|
||||||
|
goal: >
|
||||||
|
Write a creative, engaging ~5000-word draft blog article about {inner_title}
|
||||||
|
backstory: >
|
||||||
|
You are a journalist, Software Developer and DevOps expert writing a
|
||||||
|
draft blog article for other tech enthusiasts. You like to use almost no
|
||||||
|
code examples and prefer to talk in a light comedic tone. You are also
|
||||||
|
Australian. You lean into sharp wit and concise, punchy sentences. You
|
||||||
|
love finding unexpected connections between seemingly unrelated topics.
|
||||||
|
|
||||||
|
journalist_three:
|
||||||
|
role: >
|
||||||
|
Creative Technology Journalist
|
||||||
|
goal: >
|
||||||
|
Write a creative, engaging ~5000-word draft blog article about {inner_title}
|
||||||
|
backstory: >
|
||||||
|
You are a journalist, Software Developer and DevOps expert writing a
|
||||||
|
draft blog article for other tech enthusiasts. You like to use almost no
|
||||||
|
code examples and prefer to talk in a light comedic tone. You are also
|
||||||
|
Australian. You prefer a conversational, meandering style that draws the
|
||||||
|
reader in with personal anecdotes and rhetorical questions.
|
||||||
|
|
||||||
|
journalist_four:
|
||||||
|
role: >
|
||||||
|
Creative Technology Journalist
|
||||||
|
goal: >
|
||||||
|
Write a creative, engaging ~5000-word draft blog article about {inner_title}
|
||||||
|
backstory: >
|
||||||
|
You are a journalist, Software Developer and DevOps expert writing a
|
||||||
|
draft blog article for other tech enthusiasts. You like to use almost no
|
||||||
|
code examples and prefer to talk in a light comedic tone. You are also
|
||||||
|
Australian. You take a methodical, analytical approach with detailed
|
||||||
|
explanations and systematic breakdowns of complex topics.
|
||||||
79
src/ai_generators/crews/writing_crew/config/tasks.yaml
Normal file
79
src/ai_generators/crews/writing_crew/config/tasks.yaml
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
write_draft_one:
|
||||||
|
description: >
|
||||||
|
Write a 5000 word draft blog article as a markdown document.
|
||||||
|
The title for the blog is {inner_title}.
|
||||||
|
Do not output the title in the markdown.
|
||||||
|
|
||||||
|
The basis for the content of the blog is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
Research findings to incorporate and validate against:
|
||||||
|
<research>{research_findings}</research>
|
||||||
|
|
||||||
|
Write creatively, with a light comedic tone. You are Australian.
|
||||||
|
Use almost no code examples. Make it engaging for tech enthusiasts.
|
||||||
|
Only output the markdown content — no commentary, no meta-description.
|
||||||
|
expected_output: >
|
||||||
|
A ~5000-word markdown draft blog article about {inner_title}.
|
||||||
|
No title in the output. No commentary or meta-description.
|
||||||
|
agent: journalist_one
|
||||||
|
|
||||||
|
write_draft_two:
|
||||||
|
description: >
|
||||||
|
Write a 5000 word draft blog article as a markdown document.
|
||||||
|
The title for the blog is {inner_title}.
|
||||||
|
Do not output the title in the markdown.
|
||||||
|
|
||||||
|
The basis for the content of the blog is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
Research findings to incorporate and validate against:
|
||||||
|
<research>{research_findings}</research>
|
||||||
|
|
||||||
|
Write creatively, with a light comedic tone. You are Australian.
|
||||||
|
Use almost no code examples. Make it engaging for tech enthusiasts.
|
||||||
|
Only output the markdown content — no commentary, no meta-description.
|
||||||
|
expected_output: >
|
||||||
|
A ~5000-word markdown draft blog article about {inner_title}.
|
||||||
|
No title in the output. No commentary or meta-description.
|
||||||
|
agent: journalist_two
|
||||||
|
|
||||||
|
write_draft_three:
|
||||||
|
description: >
|
||||||
|
Write a 5000 word draft blog article as a markdown document.
|
||||||
|
The title for the blog is {inner_title}.
|
||||||
|
Do not output the title in the markdown.
|
||||||
|
|
||||||
|
The basis for the content of the blog is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
Research findings to incorporate and validate against:
|
||||||
|
<research>{research_findings}</research>
|
||||||
|
|
||||||
|
Write creatively, with a light comedic tone. You are Australian.
|
||||||
|
Use almost no code examples. Make it engaging for tech enthusiasts.
|
||||||
|
Only output the markdown content — no commentary, no meta-description.
|
||||||
|
expected_output: >
|
||||||
|
A ~5000-word markdown draft blog article about {inner_title}.
|
||||||
|
No title in the output. No commentary or meta-description.
|
||||||
|
agent: journalist_three
|
||||||
|
|
||||||
|
write_draft_four:
|
||||||
|
description: >
|
||||||
|
Write a 5000 word draft blog article as a markdown document.
|
||||||
|
The title for the blog is {inner_title}.
|
||||||
|
Do not output the title in the markdown.
|
||||||
|
|
||||||
|
The basis for the content of the blog is:
|
||||||
|
<blog>{content}</blog>
|
||||||
|
|
||||||
|
Research findings to incorporate and validate against:
|
||||||
|
<research>{research_findings}</research>
|
||||||
|
|
||||||
|
Write creatively, with a light comedic tone. You are Australian.
|
||||||
|
Use almost no code examples. Make it engaging for tech enthusiasts.
|
||||||
|
Only output the markdown content — no commentary, no meta-description.
|
||||||
|
expected_output: >
|
||||||
|
A ~5000-word markdown draft blog article about {inner_title}.
|
||||||
|
No title in the output. No commentary or meta-description.
|
||||||
|
agent: journalist_four
|
||||||
128
src/ai_generators/crews/writing_crew/writing_crew.py
Normal file
128
src/ai_generators/crews/writing_crew/writing_crew.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
"""Writing crew – three journalists who write creative blog drafts in parallel."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from crewai import LLM, Agent, Crew, Process, Task
|
||||||
|
from crewai.project import CrewBase, agent, crew, task
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ollama_url() -> str:
|
||||||
|
return (
|
||||||
|
f"{os.environ['OLLAMA_PROTOCOL']}://"
|
||||||
|
f"{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_agent_models() -> list[str]:
|
||||||
|
return json.loads(os.environ["CONTENT_CREATOR_MODELS"])
|
||||||
|
|
||||||
|
|
||||||
|
# Creative-style presets per journalist: (temperature, top_p)
|
||||||
|
_JOURNALIST_PARAMS: dict[int, tuple[float, float]] = {
|
||||||
|
1: (0.70, 0.60), # moderate creativity
|
||||||
|
2: (0.85, 0.50), # high creativity, tighter focus
|
||||||
|
3: (0.60, 0.70), # lower creativity, wider associations
|
||||||
|
4: (0.50, 0.80), # methodical, analytical approach
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@CrewBase
|
||||||
|
class WritingCrew:
|
||||||
|
"""Crew of three creative journalists who write blog drafts in parallel."""
|
||||||
|
|
||||||
|
agents_config = "config/agents.yaml"
|
||||||
|
tasks_config = "config/tasks.yaml"
|
||||||
|
|
||||||
|
# ---- helpers ----
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _journalist_llm(index: int) -> LLM:
|
||||||
|
models = _get_agent_models()
|
||||||
|
model = models[index % len(models)]
|
||||||
|
temp, top_p = _JOURNALIST_PARAMS[index + 1]
|
||||||
|
return LLM(
|
||||||
|
model=f"ollama/{model}",
|
||||||
|
base_url=_get_ollama_url(),
|
||||||
|
temperature=temp,
|
||||||
|
top_p=top_p,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---- agents ----
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def journalist_one(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["journalist_one"], # type: ignore[index]
|
||||||
|
llm=self._journalist_llm(0),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=30,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def journalist_two(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["journalist_two"], # type: ignore[index]
|
||||||
|
llm=self._journalist_llm(1),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=30,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def journalist_three(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["journalist_three"], # type: ignore[index]
|
||||||
|
llm=self._journalist_llm(2),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=30,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
@agent
|
||||||
|
def journalist_four(self) -> Agent:
|
||||||
|
return Agent(
|
||||||
|
config=self.agents_config["journalist_four"], # type: ignore[index]
|
||||||
|
llm=self._journalist_llm(3),
|
||||||
|
verbose=True,
|
||||||
|
max_iter=30,
|
||||||
|
respect_context_window=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---- tasks ----
|
||||||
|
|
||||||
|
@task
|
||||||
|
def write_draft_one(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["write_draft_one"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
@task
|
||||||
|
def write_draft_two(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["write_draft_two"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
@task
|
||||||
|
def write_draft_three(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["write_draft_three"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
@task
|
||||||
|
def write_draft_four(self) -> Task:
|
||||||
|
return Task(
|
||||||
|
config=self.tasks_config["write_draft_four"], # type: ignore[index]
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---- crew ----
|
||||||
|
|
||||||
|
@crew
|
||||||
|
def crew(self) -> Crew:
|
||||||
|
return Crew(
|
||||||
|
agents=self.agents,
|
||||||
|
tasks=self.tasks,
|
||||||
|
process=Process.sequential,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
@ -1,352 +1,147 @@
|
|||||||
|
"""
|
||||||
|
OllamaGenerator – public interface for blog generation.
|
||||||
|
|
||||||
|
This module preserves the same API that ``main.py`` relies on while
|
||||||
|
delegating the heavy lifting to a CrewAI Flow (``blog_flow.BlogFlow``)
|
||||||
|
that orchestrates a researcher, four journalists, and an editor via
|
||||||
|
YAML-configured crews.
|
||||||
|
|
||||||
|
Breaking changes from the previous implementation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
* ``langchain-ollama`` is no longer required – the ``generate_system_message``
|
||||||
|
helper now talks directly to the Ollama HTTP API via the ``ollama`` client.
|
||||||
|
* Internally, blog generation is driven by CrewAI agents, crews and a Flow
|
||||||
|
rather than by hand-rolled retry loops and thread-pool executors.
|
||||||
|
|
||||||
|
Public interface (unchanged)
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
* ``OllamaGenerator(title, content, inner_title)``
|
||||||
|
* ``save_to_file(filename)`` – generates the blog and writes it to disk
|
||||||
|
* ``generate_system_message(prompt_system, prompt_human)`` – simple LLM call
|
||||||
|
* ``self.response`` – the final markdown text (populated after ``save_to_file``)
|
||||||
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import string
|
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
||||||
|
|
||||||
import chromadb
|
|
||||||
from langchain_ollama import ChatOllama
|
|
||||||
from ollama import Client
|
from ollama import Client
|
||||||
|
|
||||||
|
from ai_generators.blog_flow import BlogFlow
|
||||||
|
|
||||||
|
|
||||||
class OllamaGenerator:
|
class OllamaGenerator:
|
||||||
def __init__(self, title: str, content: str, inner_title: str):
|
"""Generate a polished blog post from raw content using CrewAI agents.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
title : str
|
||||||
|
An OS-friendly slug used for file names and ChromaDB collection
|
||||||
|
names (e.g. ``"my_blog_title"``).
|
||||||
|
content : str
|
||||||
|
The raw source content that the blog should be based on.
|
||||||
|
inner_title : str
|
||||||
|
The human-readable blog title (used in prompts and output).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
title: str,
|
||||||
|
content: str,
|
||||||
|
inner_title: str,
|
||||||
|
date: str | None = None,
|
||||||
|
category: str | None = None,
|
||||||
|
):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.inner_title = inner_title
|
self.inner_title = inner_title
|
||||||
self.content = content
|
self.content = content
|
||||||
self.response = None
|
self.date = date
|
||||||
print("In Class")
|
self.category = category
|
||||||
print(os.environ["CONTENT_CREATOR_MODELS"])
|
self.response: str | None = None
|
||||||
try:
|
|
||||||
chroma_port = int(os.environ["CHROMA_PORT"])
|
# ---- Ollama connection (used by generate_system_message) ----
|
||||||
except ValueError as e:
|
ollama_url = (
|
||||||
raise Exception(f"CHROMA_PORT is not an integer: {e}")
|
f"{os.environ['OLLAMA_PROTOCOL']}://"
|
||||||
self.chroma = chromadb.HttpClient(
|
f"{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
||||||
host=os.environ["CHROMA_HOST"], port=chroma_port
|
|
||||||
)
|
)
|
||||||
ollama_url = f"{os.environ['OLLAMA_PROTOCOL']}://{os.environ['OLLAMA_HOST']}:{os.environ['OLLAMA_PORT']}"
|
|
||||||
self.ollama_client = Client(host=ollama_url)
|
self.ollama_client = Client(host=ollama_url)
|
||||||
self.ollama_model = os.environ["EDITOR_MODEL"]
|
self.ollama_model = os.environ["EDITOR_MODEL"]
|
||||||
self.embed_model = os.environ["EMBEDDING_MODEL"]
|
|
||||||
self.agent_models = json.loads(os.environ["CONTENT_CREATOR_MODELS"])
|
|
||||||
self.llm = ChatOllama(
|
|
||||||
model=self.ollama_model, temperature=0.6, top_p=0.5
|
|
||||||
) # This is the level head in the room
|
|
||||||
self.prompt_inject = f"""
|
|
||||||
You are a journalist, Software Developer and DevOps expert
|
|
||||||
writing a 5000 word draft blog article for other tech enthusiasts.
|
|
||||||
You like to use almost no code examples and prefer to talk
|
|
||||||
in a light comedic tone. You are also Australian
|
|
||||||
As this person write this blog as a markdown document.
|
|
||||||
The title for the blog is {self.inner_title}.
|
|
||||||
Do not output the title in the markdown.
|
|
||||||
The basis for the content of the blog is:
|
|
||||||
<blog>{self.content}</blog>
|
|
||||||
"""
|
|
||||||
|
|
||||||
def split_into_chunks(self, text, chunk_size=100):
|
# ---- Validate required env vars early ----
|
||||||
"""Split text into chunks of size chunk_size"""
|
|
||||||
words = re.findall(r"\S+", text)
|
|
||||||
|
|
||||||
chunks = []
|
|
||||||
current_chunk = []
|
|
||||||
word_count = 0
|
|
||||||
|
|
||||||
for word in words:
|
|
||||||
current_chunk.append(word)
|
|
||||||
word_count += 1
|
|
||||||
|
|
||||||
if word_count >= chunk_size:
|
|
||||||
chunks.append(" ".join(current_chunk))
|
|
||||||
current_chunk = []
|
|
||||||
word_count = 0
|
|
||||||
|
|
||||||
if current_chunk:
|
|
||||||
chunks.append(" ".join(current_chunk))
|
|
||||||
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
def generate_draft(self, model) -> str:
|
|
||||||
"""Generate a draft blog post using the specified model"""
|
|
||||||
|
|
||||||
def _generate():
|
|
||||||
# the idea behind this is to make the "creativity" random amongst the content creators
|
|
||||||
# contorlling temperature will allow cause the output to allow more "random" connections in sentences
|
|
||||||
# Controlling top_p will tighten or loosen the embedding connections made
|
|
||||||
# The result should be varied levels of "creativity" in the writing of the drafts
|
|
||||||
# for more see https://python.langchain.com/v0.2/api_reference/ollama/chat_models/langchain_ollama.chat_models.ChatOllama.html
|
|
||||||
temp = random.uniform(0.5, 1.0)
|
|
||||||
top_p = random.uniform(0.4, 0.8)
|
|
||||||
top_k = int(random.uniform(30, 80))
|
|
||||||
agent_llm = ChatOllama(
|
|
||||||
model=model, temperature=temp, top_p=top_p, top_k=top_k
|
|
||||||
)
|
|
||||||
messages = [
|
|
||||||
(
|
|
||||||
"system",
|
|
||||||
"You are a creative writer specialising in writing about technology",
|
|
||||||
),
|
|
||||||
("human", self.prompt_inject),
|
|
||||||
]
|
|
||||||
response = agent_llm.invoke(messages)
|
|
||||||
return (
|
|
||||||
response.text if hasattr(response, "text") else str(response)
|
|
||||||
) # ['message']['content']
|
|
||||||
|
|
||||||
# Retry mechanism with 30-minute timeout
|
|
||||||
timeout_seconds = 30 * 60 # 30 minutes
|
|
||||||
max_retries = 3
|
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
|
||||||
future = executor.submit(_generate)
|
|
||||||
result = future.result(timeout=timeout_seconds)
|
|
||||||
return result
|
|
||||||
except TimeoutError:
|
|
||||||
print(
|
|
||||||
f"AI call timed out after {timeout_seconds} seconds on attempt {attempt + 1}"
|
|
||||||
)
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
print("Retrying...")
|
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
raise Exception(
|
|
||||||
f"AI call failed to complete after {max_retries} attempts with {timeout_seconds} second timeouts"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
print(f"Attempt {attempt + 1} failed with error: {e}. Retrying...")
|
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
raise Exception(
|
|
||||||
f"Failed to generate blog draft after {max_retries} attempts: {e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_draft_embeddings(self, draft_chunks):
|
|
||||||
"""Get embeddings for the draft chunks"""
|
|
||||||
try:
|
try:
|
||||||
# Handle empty draft chunks
|
_ = json.loads(os.environ["CONTENT_CREATOR_MODELS"])
|
||||||
if not draft_chunks:
|
except (KeyError, json.JSONDecodeError) as exc:
|
||||||
print("Warning: No draft chunks to embed")
|
raise Exception(
|
||||||
return []
|
f"CONTENT_CREATOR_MODELS env var is missing or invalid: {exc}"
|
||||||
|
|
||||||
embeds = self.ollama_client.embed(
|
|
||||||
model=self.embed_model, input=draft_chunks
|
|
||||||
)
|
)
|
||||||
embeddings = embeds.get("embeddings", [])
|
|
||||||
|
|
||||||
# Check if embeddings were generated successfully
|
|
||||||
if not embeddings:
|
|
||||||
print("Warning: No embeddings generated")
|
|
||||||
return []
|
|
||||||
|
|
||||||
return embeddings
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error generating embeddings: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def id_generator(self, size=6, chars=string.ascii_uppercase + string.digits):
|
|
||||||
return "".join(random.choice(chars) for _ in range(size))
|
|
||||||
|
|
||||||
def load_to_vector_db(self):
|
|
||||||
"""Load the generated blog drafts into a vector database"""
|
|
||||||
collection_name = (
|
|
||||||
f"blog_{self.title.lower().replace(' ', '_')}_{self.id_generator()}"
|
|
||||||
)
|
|
||||||
collection = self.chroma.get_or_create_collection(
|
|
||||||
name=collection_name
|
|
||||||
) # , metadata={"hnsw:space": "cosine"})
|
|
||||||
# if any(collection.name == collectionname for collectionname in self.chroma.list_collections()):
|
|
||||||
# self.chroma.delete_collection("blog_creator")
|
|
||||||
for model in self.agent_models:
|
|
||||||
print(f"Generating draft from {model} for load into vector database")
|
|
||||||
try:
|
|
||||||
draft_content = self.generate_draft(model)
|
|
||||||
draft_chunks = self.split_into_chunks(draft_content)
|
|
||||||
|
|
||||||
# Skip if no content was generated
|
|
||||||
if not draft_chunks or all(
|
|
||||||
chunk.strip() == "" for chunk in draft_chunks
|
|
||||||
):
|
|
||||||
print(f"Skipping {model} - no content generated")
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f"generating embeds for {model}")
|
|
||||||
embeds = self.get_draft_embeddings(draft_chunks)
|
|
||||||
|
|
||||||
# Skip if no embeddings were generated
|
|
||||||
if not embeds:
|
|
||||||
print(f"Skipping {model} - no embeddings generated")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Ensure we have the same number of embeddings as chunks
|
|
||||||
if len(embeds) != len(draft_chunks):
|
|
||||||
print(
|
|
||||||
f"Warning: Mismatch between chunks ({len(draft_chunks)}) and embeddings ({len(embeds)}) for {model}"
|
|
||||||
)
|
|
||||||
# Truncate or pad to match
|
|
||||||
min_length = min(len(embeds), len(draft_chunks))
|
|
||||||
draft_chunks = draft_chunks[:min_length]
|
|
||||||
embeds = embeds[:min_length]
|
|
||||||
if min_length == 0:
|
|
||||||
print(f"Skipping {model} - no valid content/embeddings pairs")
|
|
||||||
continue
|
|
||||||
|
|
||||||
ids = [model + str(i) for i in range(len(draft_chunks))]
|
|
||||||
chunknumber = list(range(len(draft_chunks)))
|
|
||||||
metadata = [{"model_agent": model} for index in chunknumber]
|
|
||||||
print(f"loading into collection for {model}")
|
|
||||||
collection.add(
|
|
||||||
documents=draft_chunks,
|
|
||||||
embeddings=embeds,
|
|
||||||
ids=ids,
|
|
||||||
metadatas=metadata,
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error processing model {model}: {e}")
|
|
||||||
# Continue with other models rather than failing completely
|
|
||||||
continue
|
|
||||||
|
|
||||||
return collection
|
|
||||||
|
|
||||||
def generate_markdown(self) -> str:
|
|
||||||
prompt_human = f"""
|
|
||||||
You are an editor taking information from {len(self.agent_models)} Software
|
|
||||||
Developers and Data experts
|
|
||||||
writing a 5000 word blog article. You like when they use almost no code examples.
|
|
||||||
You are also Australian. The content may have light comedic elements,
|
|
||||||
you are more professional and will attempt to tone these down
|
|
||||||
As this person produce the final version of this blog as a markdown document
|
|
||||||
keeping in mind the context provided by the previous drafts.
|
|
||||||
You are to produce the content not placeholders for further editors
|
|
||||||
The title for the blog is {self.inner_title}.
|
|
||||||
Do not output the title in the markdown. Avoid repeated sentences
|
|
||||||
The basis for the content of the blog is:
|
|
||||||
<blog>{self.content}</blog>
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _generate_final_document():
|
|
||||||
try:
|
|
||||||
embed_result = self.ollama_client.embed(
|
|
||||||
model=self.embed_model, input=prompt_human
|
|
||||||
)
|
|
||||||
query_embed = embed_result.get("embeddings", [])
|
|
||||||
if not query_embed:
|
|
||||||
print(
|
|
||||||
"Warning: Failed to generate query embeddings, using empty list"
|
|
||||||
)
|
|
||||||
query_embed = [[]] # Use a single empty embedding as fallback
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error generating query embeddings: {e}")
|
|
||||||
# Generate empty embeddings as fallback
|
|
||||||
query_embed = [[]] # Use a single empty embedding as fallback
|
|
||||||
|
|
||||||
collection = self.load_to_vector_db()
|
|
||||||
|
|
||||||
# Try to query the collection, with fallback for empty collections
|
|
||||||
try:
|
|
||||||
collection_query = collection.query(
|
|
||||||
query_embeddings=query_embed, n_results=100
|
|
||||||
)
|
|
||||||
print("Showing pertinent info from drafts used in final edited edition")
|
|
||||||
|
|
||||||
# Get documents with error handling
|
|
||||||
query_result = collection.query(
|
|
||||||
query_embeddings=query_embed, n_results=100
|
|
||||||
)
|
|
||||||
documents = query_result.get("documents", [])
|
|
||||||
|
|
||||||
if documents and len(documents) > 0 and len(documents[0]) > 0:
|
|
||||||
pertinent_draft_info = "\n\n".join(documents[0])
|
|
||||||
else:
|
|
||||||
print("Warning: No relevant documents found in collection")
|
|
||||||
pertinent_draft_info = "No relevant information found in drafts."
|
|
||||||
|
|
||||||
except Exception as query_error:
|
|
||||||
print(f"Error querying collection: {query_error}")
|
|
||||||
pertinent_draft_info = (
|
|
||||||
"No relevant information found in drafts due to query error."
|
|
||||||
)
|
|
||||||
# print(pertinent_draft_info)
|
|
||||||
prompt_system = f"""Generate the final, 5000 word, draft of the blog using this information from the drafts: <context>{pertinent_draft_info}</context>
|
|
||||||
- Only output in markdown, do not wrap in markdown tags, Only provide the draft not a commentary on the drafts in the context
|
|
||||||
"""
|
|
||||||
print("Generating final document")
|
|
||||||
messages = [
|
|
||||||
("system", prompt_system),
|
|
||||||
("human", prompt_human),
|
|
||||||
]
|
|
||||||
response = self.llm.invoke(messages)
|
|
||||||
return response.text if hasattr(response, "text") else str(response)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Retry mechanism with 30-minute timeout
|
_ = int(os.environ["CHROMA_PORT"])
|
||||||
timeout_seconds = 30 * 60 # 30 minutes
|
except (KeyError, ValueError) as exc:
|
||||||
max_retries = 3
|
raise Exception(f"CHROMA_PORT is not an integer: {exc}")
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
# ------------------------------------------------------------------
|
||||||
try:
|
# Public API
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
# ------------------------------------------------------------------
|
||||||
future = executor.submit(_generate_final_document)
|
|
||||||
self.response = future.result(timeout=timeout_seconds)
|
|
||||||
break # Success, exit the retry loop
|
|
||||||
except TimeoutError:
|
|
||||||
print(
|
|
||||||
f"AI call timed out after {timeout_seconds} seconds on attempt {attempt + 1}"
|
|
||||||
)
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
print("Retrying...")
|
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
raise Exception(
|
|
||||||
f"AI call failed to complete after {max_retries} attempts with {timeout_seconds} second timeouts"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
print(
|
|
||||||
f"Attempt {attempt + 1} failed with error: {e}. Retrying..."
|
|
||||||
)
|
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
raise Exception(
|
|
||||||
f"Failed to generate markdown after {max_retries} attempts: {e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# self.response = self.ollama_client.chat(model=self.ollama_model,
|
|
||||||
# messages=[
|
|
||||||
# 'content': f'{prompt_enhanced}',
|
|
||||||
# },
|
|
||||||
# ])
|
|
||||||
# print ("Markdown Generated")
|
|
||||||
# print (self.response)
|
|
||||||
return self.response # ['message']['content']
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
raise Exception(f"Failed to generate markdown: {e}")
|
|
||||||
|
|
||||||
def save_to_file(self, filename: str) -> None:
|
def save_to_file(self, filename: str) -> None:
|
||||||
|
"""Run the full CrewAI blog-generation flow and write the result
|
||||||
|
to *filename*.
|
||||||
|
|
||||||
|
After this call ``self.response`` contains the final markdown text.
|
||||||
|
"""
|
||||||
|
self.response = self.generate_markdown()
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
f.write(self.generate_markdown())
|
f.write(self.response)
|
||||||
|
|
||||||
def generate_system_message(self, prompt_system, prompt_human):
|
def generate_markdown(self) -> str:
|
||||||
def _generate():
|
"""Execute the CrewAI Flow and return the final markdown document.
|
||||||
messages = [
|
|
||||||
("system", prompt_system),
|
|
||||||
("human", prompt_human),
|
|
||||||
]
|
|
||||||
response = self.llm.invoke(messages)
|
|
||||||
ai_message = response.text if hasattr(response, "text") else str(response)
|
|
||||||
return ai_message
|
|
||||||
|
|
||||||
# Retry mechanism with 30-minute timeout
|
The Flow:
|
||||||
timeout_seconds = 30 * 60 # 30 minutes
|
1. **Research crew** – a critical researcher with web search
|
||||||
|
investigates the topic and produces verified findings.
|
||||||
|
2. **Writing crew** – four creative journalists write draft
|
||||||
|
blog articles in parallel.
|
||||||
|
3. **Editor crew** – a critical editor loads the journalist drafts
|
||||||
|
into the vector DB, queries for relevant context, and produces
|
||||||
|
the polished final document.
|
||||||
|
"""
|
||||||
|
inputs = {
|
||||||
|
"title": self.title,
|
||||||
|
"inner_title": self.inner_title,
|
||||||
|
"content": self.content,
|
||||||
|
}
|
||||||
|
if self.date is not None:
|
||||||
|
inputs["date"] = self.date
|
||||||
|
if self.category is not None:
|
||||||
|
inputs["category"] = self.category
|
||||||
|
|
||||||
|
flow = BlogFlow()
|
||||||
|
result = flow.kickoff(inputs=inputs)
|
||||||
|
return str(result)
|
||||||
|
|
||||||
|
def generate_system_message(self, prompt_system: str, prompt_human: str) -> str:
|
||||||
|
"""Send a system/human message pair to the editor model and return
|
||||||
|
the assistant's response.
|
||||||
|
|
||||||
|
This is a lightweight helper used by ``main.py`` for generating
|
||||||
|
commit messages and notification text – it does **not** invoke the
|
||||||
|
full CrewAI Flow.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _generate() -> str:
|
||||||
|
response = self.ollama_client.chat(
|
||||||
|
model=self.ollama_model,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": prompt_system},
|
||||||
|
{"role": "user", "content": prompt_human},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
return response["message"]["content"]
|
||||||
|
|
||||||
|
# Retry mechanism with 30-minute timeout (same as the original)
|
||||||
|
timeout_seconds = 30 * 60
|
||||||
max_retries = 3
|
max_retries = 3
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
@ -357,22 +152,30 @@ class OllamaGenerator:
|
|||||||
return result
|
return result
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
print(
|
print(
|
||||||
f"AI call timed out after {timeout_seconds} seconds on attempt {attempt + 1}"
|
f"AI call timed out after {timeout_seconds} seconds "
|
||||||
|
f"on attempt {attempt + 1}"
|
||||||
)
|
)
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
print("Retrying...")
|
print("Retrying...")
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
time.sleep(5)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"AI call failed to complete after {max_retries} attempts with {timeout_seconds} second timeouts"
|
f"AI call failed to complete after {max_retries} "
|
||||||
|
f"attempts with {timeout_seconds} second timeouts"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as exc:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
print(f"Attempt {attempt + 1} failed with error: {e}. Retrying...")
|
print(
|
||||||
time.sleep(5) # Wait 5 seconds before retrying
|
f"Attempt {attempt + 1} failed with error: {exc}. Retrying..."
|
||||||
|
)
|
||||||
|
time.sleep(5)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"Failed to generate system message after {max_retries} attempts: {e}"
|
f"Failed to generate system message after "
|
||||||
|
f"{max_retries} attempts: {exc}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Should never reach here, but satisfy type-checkers
|
||||||
|
raise RuntimeError("Unexpected exit from generate_system_message")
|
||||||
|
|||||||
4
src/ai_generators/tools/__init__.py
Normal file
4
src/ai_generators/tools/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# Tools package for the blog generation CrewAI flow.
|
||||||
|
from ai_generators.tools.ollama_web_search_tool import OllamaWebSearchTool
|
||||||
|
|
||||||
|
__all__ = ["OllamaWebSearchTool"]
|
||||||
124
src/ai_generators/tools/ollama_web_search_tool.py
Normal file
124
src/ai_generators/tools/ollama_web_search_tool.py
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Custom CrewAI tool that wraps Ollama's native web search API.
|
||||||
|
|
||||||
|
This tool allows CrewAI agents to perform web searches using an Ollama
|
||||||
|
subscription instead of third-party services like Serper or EXA.
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- Ollama Python library: pip install ollama
|
||||||
|
- OLLAMA_API_KEY environment variable set with your Ollama API key
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import ollama
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaWebSearchInput(BaseModel):
|
||||||
|
"""Input schema for OllamaWebSearchTool."""
|
||||||
|
|
||||||
|
query: str = Field(
|
||||||
|
...,
|
||||||
|
description="The web search query string. Be specific and include relevant keywords.",
|
||||||
|
)
|
||||||
|
max_results: int = Field(
|
||||||
|
default=5,
|
||||||
|
ge=1,
|
||||||
|
le=10,
|
||||||
|
description="Maximum number of search results to return (1-10, default 5).",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaWebSearchTool(BaseTool):
|
||||||
|
"""
|
||||||
|
Web search tool using Ollama's native web search API.
|
||||||
|
|
||||||
|
This tool performs live web searches and returns relevant results with
|
||||||
|
titles, URLs, and content snippets. It's ideal for research tasks that
|
||||||
|
require current, up-to-date information from the internet.
|
||||||
|
|
||||||
|
The tool requires an Ollama subscription and the OLLAMA_API_KEY environment
|
||||||
|
variable to be set.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
from ai_generators.tools.ollama_web_search_tool import OllamaWebSearchTool
|
||||||
|
|
||||||
|
researcher = Agent(
|
||||||
|
role="Researcher",
|
||||||
|
goal="Research topics thoroughly",
|
||||||
|
tools=[OllamaWebSearchTool()],
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "ollama_web_search"
|
||||||
|
description: str = (
|
||||||
|
"Search the web for current information using Ollama's web search API. "
|
||||||
|
"Use this tool when you need to find up-to-date information, verify claims, "
|
||||||
|
"find supporting or contradicting evidence, or research topics that require "
|
||||||
|
"current data. Returns search results with titles, URLs, and content snippets."
|
||||||
|
)
|
||||||
|
args_schema: type[BaseModel] = OllamaWebSearchInput
|
||||||
|
|
||||||
|
def _run(self, query: str, max_results: int = 5) -> str:
|
||||||
|
"""
|
||||||
|
Execute a web search and return formatted results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: The search query string
|
||||||
|
max_results: Maximum number of results to return (1-10)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted string with search results, each containing title, URL, and content
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Ensure API key is set
|
||||||
|
if not os.environ.get("OLLAMA_API_KEY"):
|
||||||
|
return "Error: OLLAMA_API_KEY environment variable is not set. Please set your Ollama API key."
|
||||||
|
|
||||||
|
# Perform the web search
|
||||||
|
response = ollama.web_search(query=query, max_results=max_results)
|
||||||
|
|
||||||
|
# Extract and format results
|
||||||
|
results = response.get("results", [])
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return f"No search results found for query: '{query}'"
|
||||||
|
|
||||||
|
formatted_results = []
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
title = result.get("title", "No title")
|
||||||
|
url = result.get("url", "No URL")
|
||||||
|
content = result.get("content", "No content available")
|
||||||
|
|
||||||
|
formatted_results.append(
|
||||||
|
f"Result {i}:\nTitle: {title}\nURL: {url}\nContent: {content}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
return "\n".join(formatted_results)
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
return f"Error performing web search: {exc}"
|
||||||
|
|
||||||
|
def _handle_exception(self, exc: Exception) -> str:
|
||||||
|
"""Handle exceptions gracefully and return a user-friendly error message."""
|
||||||
|
error_message = str(exc)
|
||||||
|
|
||||||
|
# Check for common error types
|
||||||
|
if "authentication" in error_message.lower() or "401" in error_message:
|
||||||
|
return (
|
||||||
|
"Authentication error: Your OLLAMA_API_KEY may be invalid or expired. "
|
||||||
|
"Please check your API key and ensure it's set correctly in the environment."
|
||||||
|
)
|
||||||
|
elif "rate limit" in error_message.lower() or "429" in error_message:
|
||||||
|
return "Rate limit exceeded: Too many search requests. Please wait a moment and try again."
|
||||||
|
elif (
|
||||||
|
"network" in error_message.lower() or "connection" in error_message.lower()
|
||||||
|
):
|
||||||
|
return (
|
||||||
|
"Network error: Unable to connect to Ollama's web search service. "
|
||||||
|
"Please check your internet connection and try again."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return f"Search failed: {error_message}"
|
||||||
Loading…
x
Reference in New Issue
Block a user