test local text-to-image models and cloudflare API

2025-10-18 16:06:54 +03:00
parent 2cf0a9f711
commit ae497eac6e
10 changed files with 216 additions and 21 deletions
--- a/3d-generation-pipeline/.gitignore
+++ b/3d-generation-pipeline/.gitignore
@@ -1 +1,2 @@
-.venv
+.venv
 .env
--- a/3d-generation-pipeline/cloudflare_API_test.ipynb
+++ b/3d-generation-pipeline/cloudflare_API_test.ipynb
@@ -0,0 +1,172 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dc6faae",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import base64\n",
    "import requests\n",
    "from dotenv import load_dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3107275",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv()\n",
    "\n",
    "ACCOUNT_ID = os.environ[\"CF_ACCOUNT_ID\"]\n",
    "API_TOKEN = os.environ[\"CF_API_TOKEN\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "999adf95",
   "metadata": {},
   "source": [
    "## Text to image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40b35163",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved: output.jpg  (263282 bytes)\n",
      "Saved: image9.jpg  (263282 bytes)\n"
     ]
    }
   ],
   "source": [
    "MODEL = \"@cf/black-forest-labs/flux-1-schnell\"\n",
    "URL = f\"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/{MODEL}\"\n",
    "\n",
    "payload = {\n",
    "    \"prompt\": \"a slightly curved broadsword with a fancy golden crossguard\",\n",
    "}\n",
    "\n",
    "headers = {\n",
    "    \"Authorization\": f\"Bearer {API_TOKEN}\",\n",
    "    \"Content-Type\": \"application/json\",\n",
    "}\n",
    "\n",
    "resp = requests.post(URL, json=payload, headers=headers, timeout=60)\n",
    "resp.raise_for_status()\n",
    "\n",
    "data = resp.json()\n",
    "b64 = data[\"result\"][\"image\"]\n",
    "if not b64:\n",
    "    raise RuntimeError(f\"Unexpected response structure: {data}\")\n",
    "\n",
    "img_bytes = base64.b64decode(b64)\n",
    "\n",
    "out_path = \"output.jpg\"\n",
    "with open(out_path, \"wb\") as f:\n",
    "    f.write(img_bytes)\n",
    "\n",
    "print(f\"Saved: {out_path}  ({len(img_bytes)} bytes)\")\n",
    "\n",
    "b64 = data[\"result\"][\"image\"]\n",
    "if not b64:\n",
    "    raise RuntimeError(f\"Unexpected response structure: {data}\")\n",
    "\n",
    "img_bytes = base64.b64decode(b64)\n",
    "\n",
    "out_path = \"image9.jpg\"\n",
    "with open(out_path, \"wb\") as f:\n",
    "    f.write(img_bytes)\n",
    "\n",
    "print(f\"Saved: {out_path}  ({len(img_bytes)} bytes)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14a874c4",
   "metadata": {},
   "source": [
    "## Text prompt refinement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "485f6f46",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"dark wooden battleaxe with bronze blade\"\n"
     ]
    }
   ],
   "source": [
    "MODEL = \"@cf/meta/llama-3.2-3b-instruct\"\n",
    "URL = f\"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/{MODEL}\"\n",
    "\n",
    "instructions = \"\"\"\n",
    "User is talking about some object. Your task is to generate a short and concise description of it. Use only user's own words, keep it as short as possible.\n",
    "Example:\n",
    "User: 'Umm, okay, I would like a really cool sword, with for example a bright orange crossguard. And also it should be slightly curved.'\n",
    "You: 'a slightly curved sword with bright orange crossguard'\n",
    "\"\"\"\n",
    "prompt = \"Umm, alright, can you please give me an epic battleaxe? It should have a dark wooden shaft and bronze blade.\"\n",
    "\n",
    "response = requests.post(URL,\n",
    "  headers={\"Authorization\": f\"Bearer {API_TOKEN}\"},\n",
    "  json={\n",
    "    \"messages\": [\n",
    "      {\"role\": \"system\", \"content\": instructions},\n",
    "      {\"role\": \"user\", \"content\": prompt}\n",
    "    ]\n",
    "  }\n",
    ")\n",
    "data = response.json()\n",
    "result_text = data[\"result\"][\"response\"]\n",
    "print(result_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76fa21f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/3d-generation-pipeline/generate_image.py
+++ b/3d-generation-pipeline/generate_image.py
@@ -1,17 +0,0 @@
 import torch
 from diffusers import StableDiffusion3Pipeline
 # Load the fp16 safetensors variant of the checkpoint and move it to the GPU.
 checkpoint = "stabilityai/stable-diffusion-3.5-medium"
 sd3 = StableDiffusion3Pipeline.from_pretrained(
    checkpoint,
    use_safetensors=True,
    variant="fp16",
 ).to("cuda")
 text = "A cute cat eating a slice of pizza, stunning color scheme, masterpiece, illustration"
 # CUDA generator; no explicit seed is set on it here.
 rng = torch.Generator("cuda")
 result = sd3(text, guidance_scale=3.0, generator=rng)
 # Keep only the first returned image and write it to the working directory.
 output_file = "image.png"
 result.images[0].save(output_file)
--- a/3d-generation-pipeline/generate_image_local.py
+++ b/3d-generation-pipeline/generate_image_local.py
@@ -0,0 +1,28 @@
 import time
 from pathlib import Path
 import torch
 from diffusers import StableDiffusionPipeline, StableDiffusion3Pipeline
 # Benchmark script: loads one text-to-image pipeline on the GPU, generates a
 # single image from a fixed prompt, and prints load and generation times.
 #
 # Create the output directory up front so a missing folder is fixed before
 # the slow model load and generation, instead of failing at save time.
 output_dir = Path("images")
 output_dir.mkdir(parents=True, exist_ok=True)
 # perf_counter is monotonic, so elapsed times are not skewed by system clock
 # adjustments the way time.time() differences can be.
 start_timestamp = time.perf_counter()
 # Alternative checkpoints are kept commented out together with the generation
 # times recorded for each.
 #model = "stabilityai/stable-diffusion-3.5-medium" # generation time: 13 min
 model = "stabilityai/stable-diffusion-3-medium-diffusers" # generation time: 10 min
 #model = "stabilityai/stable-diffusion-2" # generation time: 4 sec
 pipe = StableDiffusion3Pipeline.from_pretrained(model, torch_dtype=torch.float16)
 # stable-diffusion-2 needs the non-SD3 pipeline class instead:
 #pipe = StableDiffusionPipeline.from_pretrained(model, torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 model_loaded_timestamp = time.perf_counter()
 model_load_time = model_loaded_timestamp - start_timestamp
 print(f"model load time: {round(model_load_time)} seconds")
 prompt = "A majestic broadsword with a golden pommel, no background"
 image = pipe(
    prompt,
    guidance_scale=3.0,
 ).images[0]
 image_name = "image7.png"
 image.save(output_dir / image_name)
 # Measured from the end of model loading, so it includes the save above.
 generation_time = time.perf_counter() - model_loaded_timestamp
 print(f"image generation time: {round(generation_time)} seconds")
--- a/3d-generation-pipeline/images/flux-1-schnell.jpg
+++ b/3d-generation-pipeline/images/flux-1-schnell.jpg
--- a/3d-generation-pipeline/images/stable-diffusion-2.png
+++ b/3d-generation-pipeline/images/stable-diffusion-2.png
--- a/3d-generation-pipeline/images/stable-diffusion-3-5-medium.png
+++ b/3d-generation-pipeline/images/stable-diffusion-3-5-medium.png
--- a/3d-generation-pipeline/images/stable-diffusion-3-medium.png
+++ b/3d-generation-pipeline/images/stable-diffusion-3-medium.png
--- a/3d-generation-pipeline/local_model_test.ipynb
+++ b/3d-generation-pipeline/local_model_test.ipynb
--- a/3d-generation-pipeline/requirements.txt
+++ b/3d-generation-pipeline/requirements.txt
@@ -1,7 +1,6 @@
-torch==2.8.0+cu129
+#torch==2.8.0+cu129 https://pytorch.org/get-started/previous-versions/
 transformers==4.57.0
-#diffusers==0.35.1
+git+https://github.com/huggingface/diffusers.git
 git+https://github.com/huggingface/diffusers.git
 accelerate==1.10.1
 huggingface_hub[hf_xet]==1.1.10
 sentencepiece==0.2.1