```python
from langchain.document_loaders import TextLoader

# Load the raw document
loader = TextLoader("./README.md")
docs = loader.load()

from langchain.text_splitter import CharacterTextSplitter

# Split on blank lines into ~200-character chunks. The overlap must be
# smaller than chunk_size; the original 200/200 would make each chunk
# overlap its neighbor entirely.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
split_docs = text_splitter.split_documents(docs)
for text in split_docs:
    print(text, end="\n\n")
```
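If you don't have a `README.md` handy, the same splitter can run on an inline string so the effect of `chunk_size` and `chunk_overlap` is easy to inspect. A minimal sketch; the sample text and parameter values below are illustrative, not from the original example:

```python
from langchain.text_splitter import CharacterTextSplitter

# Hypothetical sample: four short paragraphs separated by blank lines
sample = "alpha one\n\nbeta two\n\ngamma three\n\ndelta four"

splitter = CharacterTextSplitter(
    separator="\n\n",   # split on blank lines first
    chunk_size=20,      # at most ~20 characters per merged chunk
    chunk_overlap=5,    # neighboring chunks may share a few characters
    length_function=len,
)
for chunk in splitter.split_text(sample):
    print(repr(chunk))
```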
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

PYTHON_CODE = """
def hello_langchain():
    print("Hello, Langchain!")

# Call the function
hello_langchain()
"""

# Use Python-aware separators so chunks break at syntactic boundaries
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])
print(python_docs)
```
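Why the chunks line up with function boundaries: `from_language` just plugs in a language-specific separator list. You can inspect that list directly, assuming a LangChain version that exposes `get_separators_for_language` (the helper `from_language` uses internally):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# The splitter tries these separators in order, so Python code tends to
# break at class/def boundaries before falling back to plain newlines.
print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))
```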
```python
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_document = (
    "# Chapter 1\n\n"
    "## Section 1\n\nHi this is the 1st section\n\nWelcome\n\n"
    "### Module 1\n\nHi this is the first module\n\n"
    "## Section 2\n\nHi this is the 2nd section"
)

# Map each heading level to a metadata key
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = splitter.split_text(markdown_document)
print(splits)
```
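Each split carries the headers it sits under as metadata, which is what makes this splitter useful for retrieval later. A minimal sketch on a shortened document; the metadata shown in the comment is what this input should produce:

```python
from langchain.text_splitter import MarkdownHeaderTextSplitter

md = "# Chapter 1\n\n## Section 1\n\nWelcome"
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
for doc in splitter.split_text(md):
    # e.g. {'Header 1': 'Chapter 1', 'Header 2': 'Section 1'} -> Welcome
    print(doc.metadata, "->", doc.page_content)
```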
OpenAI models, for example, impose token limits, and an API call must stay within them. Splitting with `from_tiktoken_encoder` measures chunk length in tokens rather than characters, which keeps each chunk inside the model's limit.
```python
import os
import openai

# Never hard-code real keys; read the key from the environment instead.
# (It is only needed for later OpenAI API calls, not for the
# tiktoken-based splitting below.)
openai.api_key = os.environ["OPENAI_API_KEY"]

from langchain.document_loaders import TextLoader

loader = TextLoader("./README.md")
docs = loader.load()

from langchain.text_splitter import CharacterTextSplitter

# Measure chunk length in tokens (via tiktoken) instead of characters
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)
print(split_docs)
```
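To confirm the chunks actually respect the token budget, re-encode them with tiktoken. This continues from the snippet above; treating "gpt2" as the default encoding is an assumption based on older LangChain releases (pass `encoding_name` or `model_name` to `from_tiktoken_encoder` to pick one explicitly):

```python
import tiktoken

# "gpt2" mirrors the assumed default encoding of from_tiktoken_encoder
enc = tiktoken.get_encoding("gpt2")
for doc in split_docs:  # split_docs comes from the previous snippet
    print(len(enc.encode(doc.page_content)), "tokens")
```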