def create_deeplake_dataset_from_s3_prefix(
    source_prefix: str,
    target_prefix: str,
    dataset_name: str,
    dataset_type: str = 'text'
) -> str:
    """Build a Deep Lake vector dataset from documents stored under an S3 prefix.

    Loads documents from ``s3://<DADOSFERA_LANDING_ZONE>/<source_prefix>``,
    splits them into overlapping character chunks, embeds them with OpenAI
    embeddings, and persists the resulting Deep Lake dataset back to S3.

    Args:
        source_prefix: S3 key prefix (within the landing zone bucket) to read
            source documents from.
        target_prefix: S3 key prefix under which the Deep Lake dataset is written.
        dataset_name: Name of the dataset (final path segment).
        dataset_type: Kind of source documents; ``'text'`` (default) or ``'csv'``.

    Returns:
        The ``s3://`` path of the created Deep Lake dataset.

    Raises:
        ValueError: If ``dataset_type`` is not ``'text'`` or ``'csv'``.
    """
    if dataset_type == 'text':
        loader = S3DirectoryLoader(consts.DADOSFERA_LANDING_ZONE, prefix=source_prefix)
    elif dataset_type == 'csv':
        loader = S3DirectoryLoader(
            consts.DADOSFERA_LANDING_ZONE, prefix=source_prefix, loader=CSVLoader
        )
    else:
        # Fail fast: the original fell through and raised a confusing
        # NameError on `loader` below for any other dataset_type.
        raise ValueError(
            f"Unsupported dataset_type: {dataset_type!r}; expected 'text' or 'csv'"
        )

    docs = loader.load_and_split()

    # Re-chunk into 500-char pieces with 100-char overlap so neighboring
    # chunks share context for retrieval.
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(docs)

    embeddings = OpenAIEmbeddings()
    dataset_path = f"s3://{consts.DADOSFERA_LANDING_ZONE}/{target_prefix}/{dataset_name}"
    DeepLake.from_documents(
        texts, embeddings, dataset_path=dataset_path
    )
    return dataset_path