49 changes: 42 additions & 7 deletions src/api/providers/lm-studio.ts
@@ -10,18 +10,21 @@ import { NativeToolCallParser } from "../../core/assistant-message/NativeToolCal
import { TagMatcher } from "../../utils/tag-matcher"

import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"
import { ApiStream } from "../transform/stream"

import { BaseProvider } from "./base-provider"
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { getModelsFromCache } from "./fetchers/modelCache"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

export class LmStudioHandler extends BaseProvider implements SingleCompletionHandler {
protected options: ApiHandlerOptions
private client: OpenAI
private readonly providerName = "LM Studio"
private glmConfig: GlmModelConfig

constructor(options: ApiHandlerOptions) {
super()
@@ -35,17 +38,32 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
apiKey: apiKey,
timeout: getApiRequestTimeout(),
})

// Detect if this is a GLM model and apply optimizations
this.glmConfig = detectGlmModel(this.options.lmStudioModelId)
if (this.options.lmStudioModelId) {
logGlmDetection(this.providerName, this.options.lmStudioModelId, this.glmConfig)
}
}

override async *createMessage(
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
metadata?: ApiHandlerCreateMessageMetadata,
): ApiStream {
const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
{ role: "system", content: systemPrompt },
...convertToOpenAiMessages(messages),
]
// For GLM models, use Z.ai format with mergeToolResultText to prevent conversation flow disruption
// For other models, use standard OpenAI format
let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[]
if (this.glmConfig.isGlm && this.glmConfig.mergeToolResultText) {
// Use Z.ai format converter which merges text after tool results into tool messages
const convertedMessages = convertToZAiFormat(messages, { mergeToolResultText: true })
openAiMessages = [{ role: "system", content: systemPrompt }, ...convertedMessages]
} else {
openAiMessages = [
{ role: "system", content: systemPrompt },
...convertToOpenAiMessages(messages),
]
}

// -------------------------
// Track token usage
@@ -83,14 +101,27 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
let assistantText = ""

try {
// Determine temperature: use GLM default (0.6) for GLM models, otherwise LM Studio default (0)
const temperature = this.options.modelTemperature ??
(this.glmConfig.isGlm ? this.glmConfig.temperature : LMSTUDIO_DEFAULT_TEMPERATURE)

// For GLM models, disable parallel_tool_calls as GLM models may not support it
const parallelToolCalls = this.glmConfig.isGlm && this.glmConfig.disableParallelToolCalls
? false
: (metadata?.parallelToolCalls ?? true)

if (this.glmConfig.isGlm && this.glmConfig.disableParallelToolCalls) {
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
}

const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming & { draft_model?: string } = {
model: this.getModel().id,
messages: openAiMessages,
temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
temperature,
stream: true,
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

if (this.options.lmStudioSpeculativeDecodingEnabled && this.options.lmStudioDraftModelId) {
@@ -187,11 +218,15 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan

async completePrompt(prompt: string): Promise<string> {
try {
// Determine temperature: use GLM default (0.6) for GLM models, otherwise LM Studio default (0)
const temperature = this.options.modelTemperature ??
(this.glmConfig.isGlm ? this.glmConfig.temperature : LMSTUDIO_DEFAULT_TEMPERATURE)

// Create params object with optional draft model
const params: any = {
model: this.getModel().id,
messages: [{ role: "user", content: prompt }],
temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
temperature,
stream: false,
}

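Note: the glm-model-detection helper imported by both providers is not included in this diff. As a rough sketch only — assuming the shape implied by its call sites (isGlm, temperature, mergeToolResultText, disableParallelToolCalls) and a simple model-id match, not the PR's actual implementation — it might look like this:

// Hypothetical sketch of src/api/providers/utils/glm-model-detection.ts
export interface GlmModelConfig {
	isGlm: boolean
	temperature: number
	mergeToolResultText: boolean
	disableParallelToolCalls: boolean
}

// Assumed heuristic: treat any model id containing "glm" as a GLM model.
export function detectGlmModel(modelId?: string): GlmModelConfig {
	const isGlm = !!modelId && /glm/i.test(modelId)
	return {
		isGlm,
		temperature: 0.6, // GLM default temperature referenced in the provider comments above
		mergeToolResultText: isGlm,
		disableParallelToolCalls: isGlm,
	}
}

export function logGlmDetection(providerName: string, modelId: string, config: GlmModelConfig): void {
	if (config.isGlm) {
		console.log(`[${providerName}] GLM model detected (${modelId}): temperature=${config.temperature}`)
	}
}

The providers above only branch on these flags and log the detection result, so any implementation with this shape would slot in behind detectGlmModel.
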
63 changes: 56 additions & 7 deletions src/api/providers/openai.ts
@@ -16,6 +16,7 @@ import { TagMatcher } from "../../utils/tag-matcher"

import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToR1Format } from "../transform/r1-format"
import { convertToZAiFormat } from "../transform/zai-format"
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
import { getModelParams } from "../transform/model-params"

@@ -24,14 +25,16 @@ import { BaseProvider } from "./base-provider"
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

// TODO: Rename this to OpenAICompatibleHandler. Also, I think the
// `OpenAINativeHandler` can subclass from this, since it's obviously
// compatible with the OpenAI API. We can also rename it to `OpenAIHandler`.
export class OpenAiHandler extends BaseProvider implements SingleCompletionHandler {
protected options: ApiHandlerOptions
protected client: OpenAI
private readonly providerName = "OpenAI"
private readonly providerName = "OpenAI Compatible"
private glmConfig: GlmModelConfig

constructor(options: ApiHandlerOptions) {
super()
@@ -77,6 +80,12 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
timeout,
})
}

// Detect if this is a GLM model and apply optimizations
this.glmConfig = detectGlmModel(this.options.openAiModelId)
if (this.options.openAiModelId) {
logGlmDetection(this.providerName, this.options.openAiModelId, this.glmConfig)
}
}

override async *createMessage(
@@ -106,6 +115,10 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

if (deepseekReasoner) {
convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (this.glmConfig.isGlm && this.glmConfig.mergeToolResultText) {
// For GLM models, use Z.ai format with mergeToolResultText to prevent conversation flow disruption
const zaiMessages = convertToZAiFormat(messages, { mergeToolResultText: true })
convertedMessages = [systemMessage, ...zaiMessages]
} else {
if (modelInfo.supportsPromptCache) {
systemMessage = {
@@ -152,16 +165,37 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

const isGrokXAI = this._isGrokXAI(this.options.openAiBaseUrl)

// Determine temperature: use GLM default (0.6) for GLM models, DeepSeek default for DeepSeek, otherwise 0
let temperature: number | undefined
if (this.options.modelTemperature !== undefined) {
temperature = this.options.modelTemperature
} else if (this.glmConfig.isGlm) {
temperature = this.glmConfig.temperature
} else if (deepseekReasoner) {
temperature = DEEP_SEEK_DEFAULT_TEMPERATURE
} else {
temperature = 0
}

// For GLM models, disable parallel_tool_calls as GLM models may not support it
const parallelToolCalls = this.glmConfig.isGlm && this.glmConfig.disableParallelToolCalls
? false
: (metadata?.parallelToolCalls ?? true)

if (this.glmConfig.isGlm && this.glmConfig.disableParallelToolCalls) {
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
}

const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model: modelId,
temperature: this.options.modelTemperature ?? (deepseekReasoner ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0),
temperature,
messages: convertedMessages,
stream: true as const,
...(isGrokXAI ? {} : { stream_options: { include_usage: true } }),
...(reasoning && reasoning),
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

// Add max_tokens if needed
@@ -221,15 +255,30 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
yield this.processUsageMetrics(lastUsage, modelInfo)
}
} else {
// Non-streaming: also apply GLM-specific settings
let nonStreamingMessages
if (deepseekReasoner) {
nonStreamingMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (this.glmConfig.isGlm && this.glmConfig.mergeToolResultText) {
// For GLM models, use Z.ai format with mergeToolResultText
const zaiMessages = convertToZAiFormat(messages, { mergeToolResultText: true })
nonStreamingMessages = [systemMessage, ...zaiMessages]
} else {
nonStreamingMessages = [systemMessage, ...convertToOpenAiMessages(messages)]
}

// For GLM models, disable parallel_tool_calls
const nonStreamingParallelToolCalls = this.glmConfig.isGlm && this.glmConfig.disableParallelToolCalls
? false
: (metadata?.parallelToolCalls ?? true)

const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
model: modelId,
messages: deepseekReasoner
? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
: [systemMessage, ...convertToOpenAiMessages(messages)],
messages: nonStreamingMessages,
// Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS)
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: nonStreamingParallelToolCalls,
}
Comment on lines 275 to 282 (Contributor Author):
When streaming is disabled, the GLM temperature setting (0.6) is not applied here. The streaming path correctly calculates and applies temperature (lines 168-178), but this non-streaming path is missing it. This will cause inconsistent behavior where GLM models get temperature=0.6 only when streaming is enabled.

Suggested change

Current (non-streaming request in this PR):
	const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
		model: modelId,
		messages: nonStreamingMessages,
		// Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS)
		tools: this.convertToolsForOpenAI(metadata?.tools),
		tool_choice: metadata?.tool_choice,
		parallel_tool_calls: nonStreamingParallelToolCalls,
	}

Suggested:
	const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
		model: modelId,
		temperature: this.options.modelTemperature ?? (this.glmConfig.isGlm ? this.glmConfig.temperature : (deepseekReasoner ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0)),
		messages: nonStreamingMessages,
		// Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS)
		tools: this.convertToolsForOpenAI(metadata?.tools),
		tool_choice: metadata?.tool_choice,
		parallel_tool_calls: nonStreamingParallelToolCalls,
	}



// Add max_tokens if needed
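Note: convertToZAiFormat is also outside this diff. The comments above describe its mergeToolResultText option as folding text that follows a tool result into the preceding tool message, so GLM models see an uninterrupted tool_call → tool-result sequence. A hypothetical illustration of that merging step — not the PR's converter, and assuming OpenAI-style messages where the trailing text arrives as a plain user message — could look like this:

// Hypothetical illustration of the mergeToolResultText behavior
import OpenAI from "openai"

function mergeTrailingTextIntoToolResults(
	messages: OpenAI.Chat.ChatCompletionMessageParam[],
): OpenAI.Chat.ChatCompletionMessageParam[] {
	const merged: OpenAI.Chat.ChatCompletionMessageParam[] = []
	for (const message of messages) {
		const previous = merged[merged.length - 1]
		// Fold a plain text message that directly follows a tool result into that tool message,
		// instead of emitting it as a separate conversational turn.
		if (previous && previous.role === "tool" && message.role === "user" && typeof message.content === "string") {
			previous.content = `${previous.content}\n\n${message.content}`
			continue
		}
		merged.push(message)
	}
	return merged
}

In the providers above this path is only taken when glmConfig.isGlm and glmConfig.mergeToolResultText are both true, so non-GLM models keep the standard convertToOpenAiMessages (or R1) conversion.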