fix: Upgrade Hugging Face Inference API to support Inference Providers (#5454)

- Upgrade @huggingface/inference from v2.6.1 to v4.13.2
- Update ChatHuggingFace to use InferenceClient with chatCompletion API
- Update HuggingFaceInference (LLM) to use v4 HfInference with Inference Providers
- Update HuggingFaceInferenceEmbedding to use v4 HfInference
- Add endpoint handling logic to ignore custom endpoints for provider-based models
- Add improved error handling and validation for API keys
- Update UI descriptions to guide users on proper configuration

Fixes #5161

Co-authored-by: Henry <hzj94@hotmail.com>
2025-11-25 17:13:36 +05:30 · 2025-11-25 17:13:36 +05:30 · 0cc7b3036e
parent 097404f24a
commit 0cc7b3036e
9 changed files with 192 additions and 82 deletions
--- a/packages/components/nodes/agentflow/Agent/Agent.ts
+++ b/packages/components/nodes/agentflow/Agent/Agent.ts
@ -1569,16 +1569,20 @@ class Agent_Agentflow implements INode {
            for await (const chunk of await llmNodeInstance.stream(messages, { signal: abortController?.signal })) {
                if (sseStreamer) {
                    let content = ''
-                    if (Array.isArray(chunk.content) && chunk.content.length > 0) {
+
+                    if (typeof chunk === 'string') {
+                        content = chunk
+                    } else if (Array.isArray(chunk.content) && chunk.content.length > 0) {
                        const contents = chunk.content as MessageContentText[]
                        content = contents.map((item) => item.text).join('')
-                    } else {
+                    } else if (chunk.content) {
                        content = chunk.content.toString()
                    }
                    sseStreamer.streamTokenEvent(chatId, content)
                }

-                response = response.concat(chunk)
+                const messageChunk = typeof chunk === 'string' ? new AIMessageChunk(chunk) : chunk
+                response = response.concat(messageChunk)
            }
        } catch (error) {
            console.error('Error during streaming:', error)
--- a/packages/components/nodes/agentflow/HumanInput/HumanInput.ts
+++ b/packages/components/nodes/agentflow/HumanInput/HumanInput.ts
@ -241,8 +241,11 @@ class HumanInput_Agentflow implements INode {
                    if (isStreamable) {
                        const sseStreamer: IServerSideEventStreamer = options.sseStreamer as IServerSideEventStreamer
                        for await (const chunk of await llmNodeInstance.stream(messages)) {
-                            sseStreamer.streamTokenEvent(chatId, chunk.content.toString())
-                            response = response.concat(chunk)
+                            const content = typeof chunk === 'string' ? chunk : chunk.content.toString()
+                            sseStreamer.streamTokenEvent(chatId, content)
+
+                            const messageChunk = typeof chunk === 'string' ? new AIMessageChunk(chunk) : chunk
+                            response = response.concat(messageChunk)
                        }
                        humanInputDescription = response.content as string
                    } else {
--- a/packages/components/nodes/agentflow/LLM/LLM.ts
+++ b/packages/components/nodes/agentflow/LLM/LLM.ts
@ -824,16 +824,20 @@ class LLM_Agentflow implements INode {
            for await (const chunk of await llmNodeInstance.stream(messages, { signal: abortController?.signal })) {
                if (sseStreamer) {
                    let content = ''
-                    if (Array.isArray(chunk.content) && chunk.content.length > 0) {
+
+                    if (typeof chunk === 'string') {
+                        content = chunk
+                    } else if (Array.isArray(chunk.content) && chunk.content.length > 0) {
                        const contents = chunk.content as MessageContentText[]
                        content = contents.map((item) => item.text).join('')
-                    } else {
+                    } else if (chunk.content) {
                        content = chunk.content.toString()
                    }
                    sseStreamer.streamTokenEvent(chatId, content)
                }

-                response = response.concat(chunk)
+                const messageChunk = typeof chunk === 'string' ? new AIMessageChunk(chunk) : chunk
+                response = response.concat(messageChunk)
            }
        } catch (error) {
            console.error('Error during streaming:', error)
--- a/packages/components/nodes/chatmodels/ChatHuggingFace/ChatHuggingFace.ts
+++ b/packages/components/nodes/chatmodels/ChatHuggingFace/ChatHuggingFace.ts
@ -41,15 +41,17 @@ class ChatHuggingFace_ChatModels implements INode {
                label: 'Model',
                name: 'model',
                type: 'string',
-                description: 'If using own inference endpoint, leave this blank',
-                placeholder: 'gpt2'
+                description:
+                    'Model name (e.g., deepseek-ai/DeepSeek-V3.2-Exp:novita). If model includes provider (:) or using router endpoint, leave Endpoint blank.',
+                placeholder: 'deepseek-ai/DeepSeek-V3.2-Exp:novita'
            },
            {
                label: 'Endpoint',
                name: 'endpoint',
                type: 'string',
                placeholder: 'https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/gpt2',
-                description: 'Using your own inference endpoint',
+                description:
+                    'Custom inference endpoint (optional). Not needed for models with providers (:) or router endpoints. Leave blank to use Inference Providers.',
                optional: true
            },
            {
@ -124,6 +126,15 @@ class ChatHuggingFace_ChatModels implements INode {
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const huggingFaceApiKey = getCredentialParam('huggingFaceApiKey', credentialData, nodeData)

+        if (!huggingFaceApiKey) {
+            console.error('[ChatHuggingFace] API key validation failed: No API key found')
+            throw new Error('HuggingFace API key is required. Please configure it in the credential settings.')
+        }
+
+        if (!huggingFaceApiKey.startsWith('hf_')) {
+            console.warn('[ChatHuggingFace] API key format warning: Key does not start with "hf_"')
+        }
+
        const obj: Partial<HFInput> = {
            model,
            apiKey: huggingFaceApiKey
--- a/packages/components/nodes/chatmodels/ChatHuggingFace/core.ts
+++ b/packages/components/nodes/chatmodels/ChatHuggingFace/core.ts
@ -56,9 +56,9 @@ export class HuggingFaceInference extends LLM implements HFInput {
        this.apiKey = fields?.apiKey ?? getEnvironmentVariable('HUGGINGFACEHUB_API_KEY')
        this.endpointUrl = fields?.endpointUrl
        this.includeCredentials = fields?.includeCredentials
-        if (!this.apiKey) {
+        if (!this.apiKey || this.apiKey.trim() === '') {
            throw new Error(
-                'Please set an API key for HuggingFace Hub in the environment variable HUGGINGFACEHUB_API_KEY or in the apiKey field of the HuggingFaceInference constructor.'
+                'Please set an API key for HuggingFace Hub. Either configure it in the credential settings in the UI, or set the environment variable HUGGINGFACEHUB_API_KEY.'
            )
        }
    }
@ -68,19 +68,21 @@ export class HuggingFaceInference extends LLM implements HFInput {
    }

    invocationParams(options?: this['ParsedCallOptions']) {
-        return {
-            model: this.model,
-            parameters: {
-                // make it behave similar to openai, returning only the generated text
-                return_full_text: false,
-                temperature: this.temperature,
-                max_new_tokens: this.maxTokens,
-                stop: options?.stop ?? this.stopSequences,
-                top_p: this.topP,
-                top_k: this.topK,
-                repetition_penalty: this.frequencyPenalty
-            }
+        // Return parameters compatible with chatCompletion API (OpenAI-compatible format)
+        const params: any = {
+            temperature: this.temperature,
+            max_tokens: this.maxTokens,
+            stop: options?.stop ?? this.stopSequences,
+            top_p: this.topP
        }
+        // Include optional parameters if they are defined
+        if (this.topK !== undefined) {
+            params.top_k = this.topK
+        }
+        if (this.frequencyPenalty !== undefined) {
+            params.frequency_penalty = this.frequencyPenalty
+        }
+        return params
    }

    async *_streamResponseChunks(
@ -88,51 +90,109 @@ export class HuggingFaceInference extends LLM implements HFInput {
        options: this['ParsedCallOptions'],
        runManager?: CallbackManagerForLLMRun
    ): AsyncGenerator<GenerationChunk> {
-        const hfi = await this._prepareHFInference()
-        const stream = await this.caller.call(async () =>
-            hfi.textGenerationStream({
-                ...this.invocationParams(options),
-                inputs: prompt
-            })
-        )
-        for await (const chunk of stream) {
-            const token = chunk.token.text
-            yield new GenerationChunk({ text: token, generationInfo: chunk })
-            await runManager?.handleLLMNewToken(token ?? '')
-
-            // stream is done
-            if (chunk.generated_text)
-                yield new GenerationChunk({
-                    text: '',
-                    generationInfo: { finished: true }
+        try {
+            const client = await this._prepareHFInference()
+            const stream = await this.caller.call(async () =>
+                client.chatCompletionStream({
+                    model: this.model,
+                    messages: [{ role: 'user', content: prompt }],
+                    ...this.invocationParams(options)
                })
+            )
+            for await (const chunk of stream) {
+                const token = chunk.choices[0]?.delta?.content || ''
+                if (token) {
+                    yield new GenerationChunk({ text: token, generationInfo: chunk })
+                    await runManager?.handleLLMNewToken(token)
+                }
+                // stream is done when finish_reason is set
+                if (chunk.choices[0]?.finish_reason) {
+                    yield new GenerationChunk({
+                        text: '',
+                        generationInfo: { finished: true }
+                    })
+                    break
+                }
+            }
+        } catch (error: any) {
+            console.error('[ChatHuggingFace] Error in _streamResponseChunks:', error)
+            // Provide more helpful error messages
+            if (error?.message?.includes('endpointUrl') || error?.message?.includes('third-party provider')) {
+                throw new Error(
+                    `Cannot use custom endpoint with model "${this.model}" that includes a provider. Please leave the Endpoint field blank in the UI. Original error: ${error.message}`
+                )
+            }
+            throw error
        }
    }

    /** @ignore */
    async _call(prompt: string, options: this['ParsedCallOptions']): Promise<string> {
-        const hfi = await this._prepareHFInference()
-        const args = { ...this.invocationParams(options), inputs: prompt }
-        const res = await this.caller.callWithOptions({ signal: options.signal }, hfi.textGeneration.bind(hfi), args)
-        return res.generated_text
+        try {
+            const client = await this._prepareHFInference()
+            // Use chatCompletion for chat models (v4 supports conversational models via Inference Providers)
+            const args = {
+                model: this.model,
+                messages: [{ role: 'user', content: prompt }],
+                ...this.invocationParams(options)
+            }
+            const res = await this.caller.callWithOptions({ signal: options.signal }, client.chatCompletion.bind(client), args)
+            const content = res.choices[0]?.message?.content || ''
+            if (!content) {
+                console.error('[ChatHuggingFace] No content in response:', JSON.stringify(res))
+                throw new Error(`No content received from HuggingFace API. Response: ${JSON.stringify(res)}`)
+            }
+            return content
+        } catch (error: any) {
+            console.error('[ChatHuggingFace] Error in _call:', error.message)
+            // Provide more helpful error messages
+            if (error?.message?.includes('endpointUrl') || error?.message?.includes('third-party provider')) {
+                throw new Error(
+                    `Cannot use custom endpoint with model "${this.model}" that includes a provider. Please leave the Endpoint field blank in the UI. Original error: ${error.message}`
+                )
+            }
+            if (error?.message?.includes('Invalid username or password') || error?.message?.includes('authentication')) {
+                throw new Error(
+                    `HuggingFace API authentication failed. Please verify your API key is correct and starts with "hf_". Original error: ${error.message}`
+                )
+            }
+            throw error
+        }
    }

    /** @ignore */
    private async _prepareHFInference() {
-        const { HfInference } = await HuggingFaceInference.imports()
-        const hfi = new HfInference(this.apiKey, {
-            includeCredentials: this.includeCredentials
-        })
-        return this.endpointUrl ? hfi.endpoint(this.endpointUrl) : hfi
+        if (!this.apiKey || this.apiKey.trim() === '') {
+            console.error('[ChatHuggingFace] API key validation failed: Empty or undefined')
+            throw new Error('HuggingFace API key is required. Please configure it in the credential settings.')
+        }
+
+        const { InferenceClient } = await HuggingFaceInference.imports()
+        // Use InferenceClient for chat models (works better with Inference Providers)
+        const client = new InferenceClient(this.apiKey)
+
+        // Don't override endpoint if model uses a provider (contains ':') or if endpoint is router-based
+        // When using Inference Providers, endpoint should be left blank - InferenceClient handles routing automatically
+        if (
+            this.endpointUrl &&
+            !this.model.includes(':') &&
+            !this.endpointUrl.includes('/v1/chat/completions') &&
+            !this.endpointUrl.includes('router.huggingface.co')
+        ) {
+            return client.endpoint(this.endpointUrl)
+        }
+
+        // Return client without endpoint override - InferenceClient will use Inference Providers automatically
+        return client
    }

    /** @ignore */
    static async imports(): Promise<{
-        HfInference: typeof import('@huggingface/inference').HfInference
+        InferenceClient: typeof import('@huggingface/inference').InferenceClient
    }> {
        try {
-            const { HfInference } = await import('@huggingface/inference')
-            return { HfInference }
+            const { InferenceClient } = await import('@huggingface/inference')
+            return { InferenceClient }
        } catch (e) {
            throw new Error('Please install huggingface as a dependency with, e.g. `pnpm install @huggingface/inference`')
        }
--- a/packages/components/nodes/embeddings/HuggingFaceInferenceEmbedding/core.ts
+++ b/packages/components/nodes/embeddings/HuggingFaceInferenceEmbedding/core.ts
@ -23,24 +23,22 @@ export class HuggingFaceInferenceEmbeddings extends Embeddings implements Huggin
        this.model = fields?.model ?? 'sentence-transformers/distilbert-base-nli-mean-tokens'
        this.apiKey = fields?.apiKey ?? getEnvironmentVariable('HUGGINGFACEHUB_API_KEY')
        this.endpoint = fields?.endpoint ?? ''
-        this.client = new HfInference(this.apiKey)
-        if (this.endpoint) this.client.endpoint(this.endpoint)
+        const hf = new HfInference(this.apiKey)
+        // v4 uses Inference Providers by default; only override if custom endpoint provided
+        this.client = this.endpoint ? hf.endpoint(this.endpoint) : hf
    }

    async _embed(texts: string[]): Promise<number[][]> {
        // replace newlines, which can negatively affect performance.
        const clean = texts.map((text) => text.replace(/\n/g, ' '))
-        const hf = new HfInference(this.apiKey)
        const obj: any = {
            inputs: clean
        }
-        if (this.endpoint) {
-            hf.endpoint(this.endpoint)
-        } else {
+        if (!this.endpoint) {
            obj.model = this.model
        }

-        const res = await this.caller.callWithOptions({}, hf.featureExtraction.bind(hf), obj)
+        const res = await this.caller.callWithOptions({}, this.client.featureExtraction.bind(this.client), obj)
        return res as number[][]
    }

--- a/packages/components/nodes/llms/HuggingFaceInference/core.ts
+++ b/packages/components/nodes/llms/HuggingFaceInference/core.ts
@ -78,6 +78,8 @@ export class HuggingFaceInference extends LLM implements HFInput {
    async _call(prompt: string, options: this['ParsedCallOptions']): Promise<string> {
        const { HfInference } = await HuggingFaceInference.imports()
        const hf = new HfInference(this.apiKey)
+        // v4 uses Inference Providers by default; only override if custom endpoint provided
+        const hfClient = this.endpoint ? hf.endpoint(this.endpoint) : hf
        const obj: any = {
            parameters: {
                // make it behave similar to openai, returning only the generated text
@ -90,12 +92,10 @@ export class HuggingFaceInference extends LLM implements HFInput {
            },
            inputs: prompt
        }
-        if (this.endpoint) {
-            hf.endpoint(this.endpoint)
-        } else {
+        if (!this.endpoint) {
            obj.model = this.model
        }
-        const res = await this.caller.callWithOptions({ signal: options.signal }, hf.textGeneration.bind(hf), obj)
+        const res = await this.caller.callWithOptions({ signal: options.signal }, hfClient.textGeneration.bind(hfClient), obj)
        return res.generated_text
    }

--- a/packages/components/package.json
+++ b/packages/components/package.json
@ -43,7 +43,7 @@
        "@google-cloud/storage": "^7.15.2",
        "@google/generative-ai": "^0.24.0",
        "@grpc/grpc-js": "^1.10.10",
-        "@huggingface/inference": "^2.6.1",
+        "@huggingface/inference": "^4.13.2",
        "@langchain/anthropic": "0.3.33",
        "@langchain/aws": "^0.1.11",
        "@langchain/baidu-qianfan": "^0.1.0",
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml