| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- import os
- from pathlib import Path
- from llama_index.core.node_parser import CodeSplitter
- from llama_index.core.schema import BaseNode
- from llama_index.readers.file import FlatReader
# Maps a lower-case file extension (without the dot) to the language name
# handed to the code splitter. Files with any other extension are ignored.
Languages: dict[str, str] = {
    "c": "c",
    "cpp": "cpp",
    "go": "go",
    "java": "java",
    "js": "javascript",
    "md": "markdown",
    "py": "python",
    "ts": "typescript",
}
def traverse(repo_path: str) -> list[str]:
    """
    Collect the paths of all supported source files under a directory tree.

    - entries whose name starts with '.' (hidden files and directories) are skipped
    - only files whose extension is a key of ``Languages`` are kept;
      the comparison is case-insensitive
    - files without any extension are never kept

    :param repo_path: path to the root directory to walk
    :return: the matching file paths, sorted
    """
    file_paths: list[str] = []

    def helper(root: str) -> None:
        # Using scandir as a context manager closes the directory handle
        # as soon as this level has been processed.
        with os.scandir(root) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue
                if entry.is_file():
                    # splitext returns '' when the name has no dot, so a file
                    # literally named `py` or `go` is not mistaken for source.
                    ext = os.path.splitext(entry.name)[1]
                    if ext and ext[1:].lower() in Languages:
                        file_paths.append(entry.path)
                elif entry.is_dir():
                    helper(entry.path)

    helper(repo_path)
    return sorted(file_paths)
def split_into_chunks(file_path, lines_per_chunk, lines_overlap, max_chars) -> list[BaseNode]:
    """
    Split a source file into chunks with ``CodeSplitter``.

    The language is looked up in ``Languages`` from the file extension
    (case-insensitive). A path with no extension, or with an extension that
    is not in ``Languages``, yields an empty list. Any exception raised while
    loading or splitting the file is printed and an empty list is returned
    instead of propagating the error.

    :param file_path: path to the file (``str`` or ``pathlib.Path``)
    :param lines_per_chunk: passed to ``CodeSplitter`` as ``chunk_lines``
    :param lines_overlap: passed to ``CodeSplitter`` as ``chunk_lines_overlap``
    :param max_chars: passed to ``CodeSplitter`` as ``max_chars``
    :return: the nodes produced by the splitter, or ``[]``
    """
    path = Path(file_path)
    # Path.suffix is '' for names without a dot, so a bare path such as
    # `py` is not treated as having the extension `py`.
    suffix = path.suffix
    if not suffix:
        return []
    lang = Languages.get(suffix[1:].lower())
    if not lang:
        return []
    try:
        documents = FlatReader().load_data(path)
        splitter = CodeSplitter(
            language=lang,
            chunk_lines=lines_per_chunk,
            chunk_lines_overlap=lines_overlap,
            max_chars=max_chars,
        )
        return splitter.get_nodes_from_documents(documents)
    except Exception as e:
        # Best-effort: report the failure ("切分失败" = "split failed") and
        # return no chunks for this file.
        print(f'`{file_path}`切分失败: {e}')
        return []
|