Skip to content

Deeplake

dadosfera.services.deeplake.create_deeplake_dataset_from_s3_prefix

create_deeplake_dataset_from_s3_prefix(source_prefix, target_prefix, dataset_name, dataset_type='text')
Source code in dadosfera/services/deeplake.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def create_deeplake_dataset_from_s3_prefix(
    source_prefix: str,
    target_prefix: str,
    dataset_name: str,
    dataset_type: str = 'text'
) -> str:
    """Build a DeepLake dataset from the documents under an S3 prefix.

    Documents are loaded from the ``consts.DADOSFERA_LANDING_ZONE`` bucket
    under ``source_prefix``, split into chunks, embedded with
    ``OpenAIEmbeddings`` and written to a DeepLake dataset in the same bucket.

    Args:
        source_prefix: Prefix in the landing-zone bucket to load documents from.
        target_prefix: Prefix in the landing-zone bucket under which the
            dataset is stored.
        dataset_name: Name of the dataset; the last segment of the dataset path.
        dataset_type: Kind of source documents, either ``'text'`` (default)
            or ``'csv'``.

    Returns:
        The dataset path, ``s3://<bucket>/<target_prefix>/<dataset_name>``.

    Raises:
        ValueError: If ``dataset_type`` is not ``'text'`` or ``'csv'``.
    """
    # Fail fast with a clear message: without this check an unsupported type
    # leaves `loader` unassigned and surfaces later as an UnboundLocalError.
    if dataset_type not in ('text', 'csv'):
        raise ValueError(
            f"Unsupported dataset_type {dataset_type!r}; expected 'text' or 'csv'"
        )

    if dataset_type == 'csv':
        loader = S3DirectoryLoader(consts.DADOSFERA_LANDING_ZONE, prefix=source_prefix, loader=CSVLoader)
    else:
        loader = S3DirectoryLoader(consts.DADOSFERA_LANDING_ZONE, prefix=source_prefix)

    # NOTE(review): load_and_split() already splits the documents, and the
    # result is split again below with CharacterTextSplitter — confirm the
    # double split is intentional.
    docs = loader.load_and_split()

    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(docs)
    embeddings = OpenAIEmbeddings()

    dataset_path = f"s3://{consts.DADOSFERA_LANDING_ZONE}/{target_prefix}/{dataset_name}"
    DeepLake.from_documents(
        texts, embeddings, dataset_path=dataset_path
    )

    return dataset_path