Merge pull request #1039 from FlowiseAI/feature/S3
Feature/Add S3 loader
This commit is contained in:
commit
c419274c06
|
|
@ -0,0 +1,241 @@
|
|||
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
import { S3Loader } from 'langchain/document_loaders/web/s3'
|
||||
import { UnstructuredLoader } from 'langchain/document_loaders/fs/unstructured'
|
||||
import { getCredentialData, getCredentialParam } from '../../../src/utils'
|
||||
import { S3Client, GetObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3'
|
||||
import { Readable } from 'node:stream'
|
||||
import * as fsDefault from 'node:fs'
|
||||
import * as path from 'node:path'
|
||||
import * as os from 'node:os'
|
||||
|
||||
/**
 * Legacy top-level key fields kept alongside the AWS SDK client configuration.
 * Both are optional and marked deprecated in favour of the `credentials` object.
 */
interface LegacyS3KeyFields {
    /** @deprecated Use the credentials object instead */
    accessKeyId?: string
    /** @deprecated Use the credentials object instead */
    secretAccessKey?: string
}

/** AWS SDK S3 client configuration extended with the deprecated top-level key fields. */
type S3Config = S3ClientConfig & LegacyS3KeyFields
|
||||
|
||||
/**
 * Document-loader node that downloads one object from an S3 bucket into a
 * temporary directory and partitions it into documents with the Unstructured API.
 */
class S3_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs?: INodeParams[]

    /** Declares the node's display metadata, its credential slot and its input parameters. */
    constructor() {
        this.label = 'S3'
        this.name = 'S3'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 's3.svg'
        this.category = 'Document Loaders'
        this.description = 'Load Data from S3 Buckets'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'AWS Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['awsApi']
        }
        this.inputs = [
            {
                label: 'Bucket',
                name: 'bucketName',
                type: 'string'
            },
            {
                label: 'Object Key',
                name: 'keyName',
                type: 'string',
                description: 'The object key (or key name) that uniquely identifies object in an Amazon S3 bucket',
                placeholder: 'AI-Paper.pdf'
            },
            {
                label: 'Region',
                name: 'region',
                type: 'options',
                options: [
                    { label: 'af-south-1', name: 'af-south-1' },
                    { label: 'ap-east-1', name: 'ap-east-1' },
                    { label: 'ap-northeast-1', name: 'ap-northeast-1' },
                    { label: 'ap-northeast-2', name: 'ap-northeast-2' },
                    { label: 'ap-northeast-3', name: 'ap-northeast-3' },
                    { label: 'ap-south-1', name: 'ap-south-1' },
                    { label: 'ap-south-2', name: 'ap-south-2' },
                    { label: 'ap-southeast-1', name: 'ap-southeast-1' },
                    { label: 'ap-southeast-2', name: 'ap-southeast-2' },
                    { label: 'ap-southeast-3', name: 'ap-southeast-3' },
                    { label: 'ap-southeast-4', name: 'ap-southeast-4' },
                    { label: 'ap-southeast-5', name: 'ap-southeast-5' },
                    { label: 'ap-southeast-6', name: 'ap-southeast-6' },
                    { label: 'ca-central-1', name: 'ca-central-1' },
                    { label: 'ca-west-1', name: 'ca-west-1' },
                    { label: 'cn-north-1', name: 'cn-north-1' },
                    { label: 'cn-northwest-1', name: 'cn-northwest-1' },
                    { label: 'eu-central-1', name: 'eu-central-1' },
                    { label: 'eu-central-2', name: 'eu-central-2' },
                    { label: 'eu-north-1', name: 'eu-north-1' },
                    { label: 'eu-south-1', name: 'eu-south-1' },
                    { label: 'eu-south-2', name: 'eu-south-2' },
                    { label: 'eu-west-1', name: 'eu-west-1' },
                    { label: 'eu-west-2', name: 'eu-west-2' },
                    { label: 'eu-west-3', name: 'eu-west-3' },
                    { label: 'il-central-1', name: 'il-central-1' },
                    { label: 'me-central-1', name: 'me-central-1' },
                    { label: 'me-south-1', name: 'me-south-1' },
                    { label: 'sa-east-1', name: 'sa-east-1' },
                    { label: 'us-east-1', name: 'us-east-1' },
                    { label: 'us-east-2', name: 'us-east-2' },
                    { label: 'us-gov-east-1', name: 'us-gov-east-1' },
                    { label: 'us-gov-west-1', name: 'us-gov-west-1' },
                    { label: 'us-west-1', name: 'us-west-1' },
                    { label: 'us-west-2', name: 'us-west-2' }
                ],
                default: 'us-east-1'
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Your Unstructured.io URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Unstructured API KEY',
                name: 'unstructuredAPIKey',
                type: 'password',
                optional: true
            },
            {
                label: 'NarrativeText Only',
                name: 'narrativeTextOnly',
                description:
                    'Only load documents with NarrativeText metadata from Unstructured. See how Unstructured partition data <a target="_blank" href="https://unstructured-io.github.io/unstructured/bricks/partition.html#">here</a>',
                default: true,
                type: 'boolean',
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Downloads the configured S3 object, runs it through the Unstructured loader and
     * returns the resulting documents.
     *
     * @param nodeData - Node instance data; `inputs` supplies bucket, key, region, the
     *   Unstructured endpoint/key, optional extra metadata and the NarrativeText filter flag.
     * @param _ - Unused.
     * @param options - Passed through to `getCredentialData` to resolve the AWS credential.
     * @returns The loaded documents, with the user metadata merged into each document's
     *   metadata when provided, and restricted to `category === 'NarrativeText'` when
     *   `narrativeTextOnly` is truthy.
     * @throws Error when the object key resolves outside the temporary directory, when the
     *   S3 download fails, or when the Unstructured loader fails.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const bucketName = nodeData.inputs?.bucketName as string
        const keyName = nodeData.inputs?.keyName as string
        const region = nodeData.inputs?.region as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const unstructuredAPIKey = nodeData.inputs?.unstructuredAPIKey as string
        const metadata = nodeData.inputs?.metadata
        const narrativeTextOnly = nodeData.inputs?.narrativeTextOnly as boolean

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
        const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)

        const loader = new S3Loader({
            bucket: bucketName,
            key: keyName,
            s3Config: {
                region,
                credentials: {
                    accessKeyId,
                    secretAccessKey
                }
            },
            unstructuredAPIURL: unstructuredAPIUrl,
            unstructuredAPIKey: unstructuredAPIKey
        })

        // The client that performs the download must receive the selected region and the
        // keys inside `credentials`; top-level accessKeyId/secretAccessKey are not read by
        // the AWS SDK v3 client, so the configured credential would otherwise be ignored.
        const s3Config: S3Config = {
            region,
            credentials: {
                accessKeyId,
                secretAccessKey
            }
        }

        const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e))

        // Replace the stock load() with one that downloads via our own S3Client and then
        // hands the local file to UnstructuredLoader.
        loader.load = async () => {
            const tempDir = fsDefault.mkdtempSync(path.join(os.tmpdir(), 's3fileloader-'))

            try {
                const filePath = path.join(tempDir, keyName)

                // path.join collapses `..` segments, so a crafted key could otherwise place
                // the file outside the temporary directory.
                if (!filePath.startsWith(tempDir + path.sep)) {
                    throw new Error(`Object key ${keyName} resolves outside of the temporary download directory.`)
                }

                try {
                    const s3Client = new S3Client(s3Config)
                    const getObjectCommand = new GetObjectCommand({
                        Bucket: bucketName,
                        Key: keyName
                    })
                    const response = await s3Client.send(getObjectCommand)

                    // Buffer the whole object in memory before writing it to disk.
                    const objectData = await new Promise<Buffer>((resolve, reject) => {
                        const chunks: Buffer[] = []

                        if (response.Body instanceof Readable) {
                            response.Body.on('data', (chunk: Buffer) => chunks.push(chunk))
                            response.Body.on('end', () => resolve(Buffer.concat(chunks)))
                            response.Body.on('error', reject)
                        } else {
                            reject(new Error('Response body is not a readable stream.'))
                        }
                    })

                    // Keys may contain `/`, so create any intermediate directories first.
                    fsDefault.mkdirSync(path.dirname(filePath), { recursive: true })
                    fsDefault.writeFileSync(filePath, objectData)
                } catch (e: unknown) {
                    throw new Error(`Failed to download file ${keyName} from S3 bucket ${bucketName}: ${describeError(e)}`)
                }

                try {
                    const unstructuredLoader = new UnstructuredLoader(filePath, {
                        apiUrl: unstructuredAPIUrl,
                        apiKey: unstructuredAPIKey
                    })
                    return await unstructuredLoader.load()
                } catch (e: unknown) {
                    // Keep the underlying cause so the failure can be diagnosed.
                    throw new Error(`Failed to load file ${filePath} using unstructured loader: ${describeError(e)}`)
                }
            } finally {
                // Always remove the whole mkdtemp root (not just the file's parent), on
                // success and on every failure path.
                fsDefault.rmSync(tempDir, { recursive: true, force: true })
            }
        }

        const docs = await loader.load()

        if (metadata) {
            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
            // User-supplied keys take precedence over the loader's own metadata.
            const finaldocs = docs.map((doc) => {
                return {
                    ...doc,
                    metadata: {
                        ...doc.metadata,
                        ...parsedMetadata
                    }
                }
            })
            return narrativeTextOnly ? finaldocs.filter((doc) => doc.metadata.category === 'NarrativeText') : finaldocs
        }

        return narrativeTextOnly ? docs.filter((doc) => doc.metadata.category === 'NarrativeText') : docs
    }
}
|
||||
// CommonJS export: exposes the loader class under the `nodeClass` key.
module.exports = { nodeClass: S3_DocumentLoaders }
|
||||
|
|
@ -0,0 +1 @@
|
|||
<svg height="2500" width="2500" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 80 80"><linearGradient id="a" x1="0%" y1="100%" y2="0%"><stop offset="0" stop-color="#1b660f"/><stop offset="1" stop-color="#6cae3e"/></linearGradient><g fill="none" fill-rule="evenodd"><path d="M0 0h80v80H0z" fill="url(#a)"/><path d="M60.836 42.893l.384-2.704c3.54 2.12 3.587 2.997 3.586 3.02-.006.006-.61.51-3.97-.316zm-1.943-.54C52.773 40.5 44.25 36.59 40.8 34.96c0-.014.004-.027.004-.041a2.406 2.406 0 0 0-2.404-2.403c-1.324 0-2.402 1.078-2.402 2.403s1.078 2.403 2.402 2.403c.582 0 1.11-.217 1.527-.562 4.058 1.92 12.515 5.774 18.68 7.594L56.17 61.56a.955.955 0 0 0-.01.14c0 1.516-6.707 4.299-17.666 4.299-11.075 0-17.853-2.783-17.853-4.298 0-.046-.003-.091-.01-.136l-5.093-37.207c4.409 3.035 13.892 4.64 22.962 4.64 9.056 0 18.523-1.6 22.94-4.625zM15 20.478C15.072 19.162 22.634 14 38.5 14c15.864 0 23.427 5.16 23.5 6.478v.449C61.13 23.877 51.33 27 38.5 27c-12.852 0-22.657-3.132-23.5-6.087zm49 .022c0-3.465-9.934-8.5-25.5-8.5S13 17.035 13 20.5l.094.754 5.548 40.524C18.775 66.31 30.86 68 38.494 68c9.472 0 19.535-2.178 19.665-6.22l2.396-16.896c1.333.319 2.43.482 3.31.482 1.184 0 1.984-.29 2.469-.867a1.95 1.95 0 0 0 .436-1.66c-.26-1.383-1.902-2.875-5.248-4.784l2.376-16.762z" fill="#fff"/></g></svg>
|
||||
|
After Width: | Height: | Size: 1.3 KiB |
|
|
@ -18,6 +18,7 @@
|
|||
"dependencies": {
|
||||
"@aws-sdk/client-bedrock-runtime": "3.422.0",
|
||||
"@aws-sdk/client-dynamodb": "^3.360.0",
|
||||
"@aws-sdk/client-s3": "^3.427.0",
|
||||
"@dqbd/tiktoken": "^1.0.7",
|
||||
"@getzep/zep-js": "^0.6.3",
|
||||
"@gomomento/sdk": "^1.40.2",
|
||||
|
|
|
|||
Loading…
Reference in New Issue