GLM-5.2 OpenAI-Compatible API: A Hands-On Guide to Reasoning Effort, Function Calling, and Long-Context Retrieval

GLM-5.2 OpenAI-Compatible API: A Hands-On Guide to Reasoning Effort, Function Calling, and Long-Context Retrieval


# Notebook-style bootstrap: make sure the `openai` SDK is available before it
# is imported further down.
import sys, subprocess
# Quietly (-q) install/upgrade (-U) `openai` using the running interpreter's
# own pip. check=False means a failed install does not raise here; if the
# package is genuinely missing, the `from openai import OpenAI` line below is
# where the failure will surface.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-U", "openai"], check=False)
import os, re, json, time, getpass
from openai import OpenAI
# Registry of OpenAI-compatible endpoints that serve the model. Each entry
# carries the endpoint base URL, the model identifier to request there, and
# the name of the environment variable / secret that holds the API key.
PROVIDERS = {
    "zai": dict(
        base_url="https://api.z.ai/api/paas/v4/",
        model="glm-5.2",
        env="ZAI_API_KEY",
    ),
    "openrouter": dict(
        base_url="https://openrouter.ai/api/v1",
        model="z-ai/glm-5.2",
        env="OPENROUTER_API_KEY",
    ),
    "together": dict(
        base_url="https://api.together.xyz/v1",
        model="zai-org/GLM-5.2",
        env="TOGETHER_API_KEY",
    ),
    "requesty": dict(
        base_url="https://router.requesty.ai/v1",
        model="zai/glm-5.2",
        env="REQUESTY_API_KEY",
    ),
    "huggingface": dict(
        base_url="https://router.huggingface.co/v1",
        model="zai-org/GLM-5.2",
        env="HF_TOKEN",
    ),
}

# Change this key to switch provider; the rest of the script reads CFG / MODEL.
PROVIDER = "zai"
CFG = PROVIDERS[PROVIDER]
MODEL = CFG["model"]
def load_api_key(env_name):
    """Resolve the API key stored under the name ``env_name``.

    Sources are tried in order and the first non-empty value wins:

    1. Colab secrets via ``google.colab.userdata`` (skipped silently when the
       import or the lookup fails for any reason, e.g. outside Colab).
    2. The process environment variable ``env_name``.
    3. An interactive hidden prompt via ``getpass``.
    """
    # Best-effort Colab lookup: any failure simply means "no secret here".
    try:
        from google.colab import userdata
        colab_secret = userdata.get(env_name)
    except Exception:
        colab_secret = None
    if colab_secret:
        return colab_secret

    from_environment = os.environ.get(env_name)
    if from_environment:
        return from_environment

    # Nothing configured: ask the user without echoing the key.
    return getpass.getpass(f"Enter your {env_name}: ")
# Shared client used for every request: the key is resolved by load_api_key
# and the endpoint comes from the selected provider configuration.
client = OpenAI(
    api_key=load_api_key(CFG["env"]),
    base_url=CFG["base_url"],
)

# Input / output token prices used for cost estimates.
# NOTE(review): presumably USD per million tokens - confirm against the
# provider's current pricing.
PRICE_IN_PER_M = 1.40
PRICE_OUT_PER_M = 4.40

# Running totals maintained by _track: prompt tokens ("in"), completion
# tokens ("out"), and the number of tracked responses ("calls").
_USAGE = dict.fromkeys(("in", "out", "calls"), 0)
def _track(usage):
    """Fold one response's token usage into the module-level ``_USAGE`` totals.

    ``usage`` is read via its ``prompt_tokens`` / ``completion_tokens``
    attributes; a missing or ``None`` attribute counts as zero. A falsy
    ``usage`` (e.g. ``None``) is ignored entirely and does not count as a call.
    """
    if not usage:
        return
    for bucket, attribute in (("in", "prompt_tokens"), ("out", "completion_tokens")):
        _USAGE[bucket] += getattr(usage, attribute, 0) or 0
    _USAGE["calls"] += 1
def get_reasoning(obj):
    """Pull the model's hidden reasoning trace from a message or stream delta.

    The trace is a provider-extra field, so where it lives depends on the SDK
    object and on the provider. ``reasoning_content`` is looked up first in
    every location; ``reasoning`` is accepted as a fallback because some
    OpenAI-compatible gateways (OpenRouter, per its documentation) return the
    trace under that name instead.

    Args:
        obj: A message/delta object (attributes, optional ``model_extra`` dict,
            optional ``to_dict()`` method), a plain ``dict`` with the same
            keys, or ``None``.

    Returns:
        The first non-empty reasoning value found, or ``None`` when there is
        none. Never raises for unexpected object shapes.
    """
    if obj is None:
        return None

    field_names = ("reasoning_content", "reasoning")

    # Plain dicts (e.g. dumped messages) carry the trace as an ordinary key.
    if isinstance(obj, dict):
        for name in field_names:
            if obj.get(name):
                return obj[name]
        return None

    extra = getattr(obj, "model_extra", None)
    if not isinstance(extra, dict):
        extra = {}

    dumped = None  # filled lazily: to_dict() is only called if really needed
    for name in field_names:
        # Same lookup order as before for each name: attribute, then
        # model_extra, then the to_dict() dump.
        value = getattr(obj, name, None)
        if value:
            return value
        if extra.get(name):
            return extra[name]
        if dumped is None:
            to_dict = getattr(obj, "to_dict", None)
            try:
                dumped = to_dict() if callable(to_dict) else {}
            except Exception:
                # Best-effort: an object that cannot be dumped has no trace.
                dumped = {}
            if not isinstance(dumped, dict):
                dumped = {}
        if dumped.get(name):
            return dumped[name]
    return None
def chat(messages, effort=None, thinking=True, tools=None, tool_choice="auto",
         stream=False, max_tokens=2048, temperature=1.0, tool_stream=False):
    """Send one chat-completion request for ``MODEL`` through the shared client.

    GLM-specific options travel in ``extra_body`` so that any OpenAI-style
    client can pass them through unchanged.

    Args:
        messages: Chat messages in OpenAI format.
        effort: ``None``, ``"high"`` or ``"max"`` - GLM-5.2 thinking-effort
            level (``max`` is the model default). Only sent when ``thinking``
            is on.
        thinking: ``True`` turns deep thinking on; ``False`` turns it off
            (fast, cheap, low-latency).
        tools: Optional tool definitions; when given, ``tool_choice`` is sent
            along with them.
        tool_choice: Tool selection mode, forwarded only together with
            ``tools``.
        stream: Request a streamed response; usage reporting is then enabled
            via ``stream_options``.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        tool_stream: When true, adds ``tool_stream=True`` to ``extra_body``.

    Returns:
        Whatever ``client.chat.completions.create`` returns for the request.
    """
    # Provider-specific switches, kept apart from the standard parameters.
    thinking_mode = "enabled" if thinking else "disabled"
    glm_options = {"thinking": {"type": thinking_mode}}
    if thinking and effort:
        glm_options["reasoning_effort"] = effort
    if tool_stream:
        glm_options["tool_stream"] = True

    request = {
        "model": MODEL,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": stream,
        "extra_body": glm_options,
    }
    if tools:
        request["tools"] = tools
        request["tool_choice"] = tool_choice
    if stream:
        # Ask for token usage to be included in the streamed response.
        request["stream_options"] = {"include_usage": True}

    return client.chat.completions.create(**request)



Source link