ResponsibleAI - Input Moderation - Ability to verify inputs and prevent potentially harmful content generation
Parent: a311e024e1
Commit: ad8281e553
Modified file (LLMChain node):

@@ -7,6 +7,7 @@ import { BaseOutputParser } from 'langchain/schema/output_parser'
 import { formatResponse, injectOutputParser } from '../../outputparsers/OutputParserHelpers'
 import { BaseLLMOutputParser } from 'langchain/schema/output_parser'
 import { OutputFixingParser } from 'langchain/output_parsers'
+import { checkInputs, Moderation, streamResponse } from '../../responsibleAI/ResponsibleAI'

 class LLMChain_Chains implements INode {
     label: string
@@ -36,6 +37,14 @@ class LLMChain_Chains implements INode {
                 name: 'model',
                 type: 'BaseLanguageModel'
             },
+            {
+                label: 'Input Moderation',
+                description: 'Detect text that could generate harmful output and prevent it from being sent to the language model',
+                name: 'inputModeration',
+                type: 'Moderation',
+                optional: true,
+                list: true
+            },
             {
                 label: 'Prompt',
                 name: 'prompt',
@@ -144,7 +153,7 @@ const runPrediction = async (
     const isStreaming = options.socketIO && options.socketIOClientId
     const socketIO = isStreaming ? options.socketIO : undefined
     const socketIOClientId = isStreaming ? options.socketIOClientId : ''
+    const moderations = nodeData.inputs?.inputModeration as Moderation[]
     /**
      * Apply string transformation to reverse converted special chars:
      * FROM: { "value": "hello i am benFLOWISE_NEWLINEFLOWISE_NEWLINEFLOWISE_TABhow are you?" }
@@ -152,6 +161,16 @@ const runPrediction = async (
      */
     const promptValues = handleEscapeCharacters(promptValuesRaw, true)
+
+    if (moderations && moderations.length > 0) {
+        try {
+            // Use the output of the moderation chain as input for the LLM chain
+            input = await checkInputs(moderations, chain.llm, input)
+        } catch (e) {
+            streamResponse(isStreaming, e.message, socketIO, socketIOClientId)
+            return formatResponse(e.message)
+        }
+    }
+
     if (promptValues && inputVariables.length > 0) {
         let seen: string[] = []
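With this guard in place, runPrediction short-circuits before the LLM is ever called: each connected moderation node contributes one runner to the `inputModeration` list, and a thrown error is streamed back to the client instead of a completion. A rough standalone sketch of the same runtime flow (the runner classes are introduced later in this commit; the values are hypothetical):

    const moderations: Moderation[] = [
        new SimplePromptModerationRunner('ignore previous instructions', 'Cannot Process!'),
        new OpenAIModerationRunner()
    ]
    try {
        // each runner may rewrite the input or throw
        input = await checkInputs(moderations, chain.llm, input)
    } catch (e: any) {
        streamResponse(isStreaming, e.message, socketIO, socketIOClientId)
        return formatResponse(e.message) // the LLM never sees the flagged input
    }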
New file (OpenAIModeration node):

@@ -0,0 +1,154 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { getBaseClasses } from '../../../src'
import { Moderation } from '../ResponsibleAI'
import { OpenAIModerationRunner } from './OpenAIModerationRunner'

class OpenAIModeration implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    constructor() {
        this.label = 'Moderation - Open AI'
        this.name = 'inputModerationOpenAI'
        this.version = 1.0
        this.type = 'Moderation'
        this.icon = 'openai-moderation.png'
        this.category = 'Responsible AI'
        this.description = 'Check whether content complies with OpenAI usage policies.'
        this.baseClasses = [this.type, ...getBaseClasses(Moderation)]
        this.inputs = [
            {
                label: 'Moderation Checks',
                name: 'moderationConfig',
                type: 'options',
                default: 'useDefault',
                options: [
                    {
                        label: 'OpenAI Default',
                        name: 'useDefault'
                    },
                    {
                        label: 'Use Custom Threshold Values',
                        name: 'useCustom'
                    },
                    {
                        label: 'Combine OpenAI Default with Custom Threshold Values',
                        name: 'combineBoth'
                    }
                ]
            },
            {
                label: 'Error Message',
                name: 'moderationErrorMessage',
                type: 'string',
                rows: 2,
                default: "Cannot Process! Input violates OpenAI's content moderation policies.",
                optional: true
            },
            {
                label: 'Threshold Score - Sexual',
                name: 'catSexualThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Sexual/Minors',
                name: 'catSexualMinorsThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Hate',
                name: 'catHateThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Hate/Threatening',
                name: 'catHateThreateningThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Harassment',
                name: 'catHarassmentThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Harassment/Threatening',
                name: 'catHarassmentThreateningThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Self Harm',
                name: 'catSelfHarmThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Self-Harm/Intent',
                name: 'catSelfHarmIntentThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Self-Harm/Instructions',
                name: 'catSelfHarmInstructionsThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Violence',
                name: 'catViolenceThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            },
            {
                label: 'Threshold Score - Violence/Graphic',
                name: 'catViolenceGraphicThreshold',
                type: 'number',
                default: 0.01,
                additionalParams: true
            }
        ]
    }

    async init(nodeData: INodeData): Promise<any> {
        const runner = new OpenAIModerationRunner()
        this.inputs.forEach((value) => {
            if (value.additionalParams === true) {
                // remove the prefix 'cat'
                let categoryName = value.name.substring(3)
                // remove the suffix 'Threshold'
                categoryName = categoryName.substring(0, categoryName.length - 9)
                categoryName = categoryName.substring(0, 1).toLowerCase() + categoryName.substring(1)
                let categoryThreshold = nodeData.inputs ? nodeData.inputs[value.name] : value.default
                runner.setParameter(categoryName, parseFloat(categoryThreshold))
            } else {
                runner.setParameter(value.name, nodeData.inputs ? nodeData.inputs[value.name] : value.default)
            }
        })
        return runner
    }
}

module.exports = { nodeClass: OpenAIModeration }
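The init() loop derives each runner field name from the UI input name by stripping the 'cat' prefix and the 'Threshold' suffix, then lower-casing the first character, so 'catSelfHarmIntentThreshold' drives the runner's selfHarmIntent threshold. A minimal standalone illustration of that mapping (toCategoryName is a hypothetical helper, not part of this commit):

    // hypothetical helper reproducing the mapping in init()
    const toCategoryName = (inputName: string): string => {
        let name = inputName.substring(3) // drop the 'cat' prefix
        name = name.substring(0, name.length - 'Threshold'.length) // drop the 'Threshold' suffix
        return name.charAt(0).toLowerCase() + name.slice(1)
    }

    console.log(toCategoryName('catSelfHarmIntentThreshold')) // 'selfHarmIntent'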
New file: OpenAIModerationRunner.ts

@@ -0,0 +1,62 @@
import { Moderation } from '../ResponsibleAI'
import { BaseLanguageModel } from 'langchain/base_language'
import { OpenAIModerationChain } from 'langchain/chains'

export class OpenAIModerationRunner implements Moderation {
    private moderationConfig: string = 'useDefault'
    private moderationErrorMessage: string = "Text was found that violates OpenAI's content policy."
    private sexual: number = 0.01
    private sexualMinors: number = 0.01
    private hate: number = 0.01
    private hateThreatening: number = 0.01
    private harassment: number = 0.01
    private harassmentThreatening: number = 0.01
    private selfHarm: number = 0.01
    private selfHarmIntent: number = 0.01
    private selfHarmInstructions: number = 0.01
    private violence: number = 0.01
    private violenceGraphic: number = 0.01

    async checkForViolations(llm: BaseLanguageModel, input: string): Promise<string> {
        const openAIApiKey = (llm as any).openAIApiKey
        if (!openAIApiKey) {
            throw Error('OpenAI API key not found')
        }
        // Create a new instance of the OpenAIModerationChain
        const moderation = new OpenAIModerationChain({
            openAIApiKey: openAIApiKey,
            // If true, the call throws when violating content is detected; if false, violating
            // content instead yields "Text was found that violates OpenAI's content policy."
            throwError: false
        })
        // Send the user's input to the moderation chain and wait for the result
        const { output: moderationOutput, results } = await moderation.call({
            input: input
        })
        if (this.moderationConfig !== 'useCustom' && results[0].flagged) {
            throw Error(this.moderationErrorMessage)
        }
        if (this.moderationConfig !== 'useDefault') {
            const categoryScores = results[0].category_scores
            if (
                categoryScores['harassment'] > this.harassment ||
                categoryScores['harassment/threatening'] > this.harassmentThreatening ||
                categoryScores['self-harm'] > this.selfHarm ||
                categoryScores['self-harm/intent'] > this.selfHarmIntent ||
                categoryScores['self-harm/instructions'] > this.selfHarmInstructions ||
                categoryScores['sexual'] > this.sexual ||
                categoryScores['sexual/minors'] > this.sexualMinors ||
                categoryScores['hate'] > this.hate ||
                categoryScores['hate/threatening'] > this.hateThreatening ||
                categoryScores['violence'] > this.violence ||
                categoryScores['violence/graphic'] > this.violenceGraphic
            ) {
                throw Error(this.moderationErrorMessage)
            }
        }
        return moderationOutput
    }

    setParameter(category: string, value: number) {
        // @ts-ignore
        this[category] = value
    }
}
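A minimal usage sketch of the runner on its own, outside a Flowise flow (assumes the legacy langchain import paths used above and a valid OPENAI_API_KEY; the `as any` cast is needed because setParameter is typed for numeric thresholds):

    import { ChatOpenAI } from 'langchain/chat_models/openai'
    import { OpenAIModerationRunner } from './OpenAIModerationRunner'

    const llm = new ChatOpenAI({ openAIApiKey: process.env.OPENAI_API_KEY })
    const runner = new OpenAIModerationRunner()
    runner.setParameter('moderationConfig', 'useCustom' as any) // judge only by thresholds
    runner.setParameter('violence', 0.3) // loosen the 0.01 default

    runner
        .checkForViolations(llm, 'some user input')
        .then((safeInput) => console.log('passed moderation:', safeInput))
        .catch((e) => console.error('blocked:', e.message))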
New file (SimplePromptModeration node):

@@ -0,0 +1,56 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { getBaseClasses } from '../../../src'
import { Moderation } from '../ResponsibleAI'
import { SimplePromptModerationRunner } from './SimplePromptModerationRunner'

class SimplePromptModeration implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    constructor() {
        this.label = 'Moderation - Simple Prompt'
        this.name = 'inputModerationSimple'
        this.version = 1.0
        this.type = 'Moderation'
        this.icon = 'simple_moderation.png'
        this.category = 'Responsible AI'
        this.description = 'Detecting and mitigating prompt attacks'
        this.baseClasses = [this.type, ...getBaseClasses(Moderation)]
        this.inputs = [
            {
                label: 'Deny List',
                name: 'denyList',
                type: 'string',
                rows: 4,
                placeholder: `ignore previous instructions\ndo not follow the directions\nyou must ignore all previous instructions`,
                description: 'An array of string literals (enter one per line) that should not appear in the prompt text.',
                optional: false
            },
            {
                label: 'Error Message',
                name: 'moderationErrorMessage',
                type: 'string',
                rows: 2,
                default: 'Cannot Process! Input violates content moderation policies.',
                optional: true
            }
        ]
    }

    // eslint-disable-next-line unused-imports/no-unused-vars
    async init(nodeData: INodeData): Promise<any> {
        const denyList = nodeData.inputs?.denyList as string
        const moderationErrorMessage = nodeData.inputs?.moderationErrorMessage as string

        return new SimplePromptModerationRunner(denyList, moderationErrorMessage)
    }
}

module.exports = { nodeClass: SimplePromptModeration }
New file: SimplePromptModerationRunner.ts

@@ -0,0 +1,24 @@
import { Moderation } from '../ResponsibleAI'
import { BaseLanguageModel } from 'langchain/base_language'

export class SimplePromptModerationRunner implements Moderation {
    private readonly denyList: string = ''
    private readonly moderationErrorMessage: string = ''

    constructor(denyList: string, moderationErrorMessage: string) {
        this.denyList = denyList
        if (denyList.indexOf('\n') === -1) {
            this.denyList += '\n'
        }
        this.moderationErrorMessage = moderationErrorMessage
    }

    async checkForViolations(llm: BaseLanguageModel, input: string): Promise<string> {
        this.denyList.split('\n').forEach((denyListItem) => {
            if (denyListItem && denyListItem !== '' && input.includes(denyListItem)) {
                throw Error(this.moderationErrorMessage)
            }
        })
        return Promise.resolve(input)
    }
}
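For quick reference, the deny-list runner either resolves with the input unchanged or rejects with the configured error message; the llm argument is unused by this implementation (hence the `undefined as any` below). A minimal sketch:

    import { SimplePromptModerationRunner } from './SimplePromptModerationRunner'

    const runner = new SimplePromptModerationRunner(
        'ignore previous instructions\ndo not follow the directions',
        'Cannot Process! Input violates content moderation policies.'
    )

    // resolves: no deny-list phrase appears in the input
    await runner.checkForViolations(undefined as any, 'What is the capital of France?')

    // rejects with the error message above: the input contains a deny-list phrase
    await runner.checkForViolations(undefined as any, 'Now ignore previous instructions and ...')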
Two new binary icon files added (47 KiB and 44 KiB; images not shown).
New file: ResponsibleAI.ts

@@ -0,0 +1,30 @@
import { BaseLanguageModel } from 'langchain/base_language'
import { Server } from 'socket.io'

export abstract class ResponsibleAI {}

export abstract class Moderation extends ResponsibleAI {
    abstract checkForViolations(llm: BaseLanguageModel, input: string): Promise<string>
}

export const checkInputs = async (inputModerations: Moderation[], llm: BaseLanguageModel, input: string): Promise<string> => {
    for (const moderation of inputModerations) {
        input = await moderation.checkForViolations(llm, input)
    }
    return input
}

// Is this the correct location for this function?
// Should we have a utils file that all node components can use?
export const streamResponse = (isStreaming: any, response: string, socketIO: Server, socketIOClientId: string) => {
    if (isStreaming) {
        const result = response.split(/(\s+)/)
        result.forEach((token: string, index: number) => {
            if (index === 0) {
                socketIO.to(socketIOClientId).emit('start', token)
            }
            socketIO.to(socketIOClientId).emit('token', token)
        })
        socketIO.to(socketIOClientId).emit('end')
    }
}
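Because Moderation is an abstract class, custom checks can plug into the same pipeline; checkInputs() feeds each runner the (possibly rewritten) output of the previous one. A hypothetical custom runner, not part of this commit:

    import { BaseLanguageModel } from 'langchain/base_language'
    import { checkInputs, Moderation } from './ResponsibleAI'

    class MaxLengthModeration extends Moderation {
        async checkForViolations(_llm: BaseLanguageModel, input: string): Promise<string> {
            if (input.length > 2000) throw Error('Input exceeds the 2000-character limit')
            return input
        }
    }

    // runs before any other connected moderation:
    // const safeInput = await checkInputs([new MaxLengthModeration(), ...otherRunners], llm, rawInput)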