Skip to content

Commit

Permalink
Merge pull request #13 from bhavnicksm/development
Browse files Browse the repository at this point in the history
Run Black + Isort + beautify the code a bit
  • Loading branch information
bhavnicksm authored Nov 8, 2024
2 parents 0f45703 + 89249cc commit 745e5e8
Show file tree
Hide file tree
Showing 14 changed files with 593 additions and 397 deletions.
17 changes: 13 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "chonkie"
version = "0.1.1"
version = "0.1.2"
description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
readme = "README.md"
requires-python = ">=3.8"
Expand All @@ -20,10 +20,19 @@ classifiers = [
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11"
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: Information Technology",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Text Processing :: Linguistic"
]
dependencies = [
"autotiktokenizer", "tokenizers>=0.13.0"
"autotiktokenizer", "tokenizers>=0.13.0", "tiktoken>=0.2.0"
]

[project.urls]
Expand All @@ -33,7 +42,7 @@ Homepage = "https://github.com/bhavnicksm/chonkie"
sentence = ["spacy>=3.0.0"]
semantic = ["sentence-transformers>=2.0.0", "numpy>=1.23.0"]
all = ["spacy>=3.0.0", "sentence-transformers>=2.0.0", "numpy>=1.23.0"]
dev = ["pytest>=6.2.0", "tranformers>=4.0.0", "tiktoken>=0.2.0"]
dev = ["pytest>=6.2.0", "transformers>=4.0.0", "tiktoken>=0.2.0", "black>=21.12b0", "isort>=5.10.2", "flake8>=4.0.0", "mypy>=0.910", "pylint>=2.11.1", "pre-commit>=2.15.0"]

[tool.setuptools]
package-dir = {"" = "src"}
Expand Down
27 changes: 9 additions & 18 deletions src/chonkie/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,9 @@
from .chunker import (
BaseChunker,
TokenChunker,
WordChunker,
SentenceChunker,
SemanticChunker,
SDPMChunker,
Chunk,
SentenceChunk,
SemanticChunk,
Sentence,
SemanticSentence
)
from .chunker import (BaseChunker, Chunk, SDPMChunker, SemanticChunk,
SemanticChunker, SemanticSentence, Sentence,
SentenceChunk, SentenceChunker, TokenChunker,
WordChunker)

__version__ = "0.1.1"
__version__ = "0.1.2"
__name__ = "chonkie"
__author__ = "Bhavnick Minhas"

Expand All @@ -21,14 +12,14 @@
"__version__",
"__author__",
"Sentence",
"SemanticSentence",
"SemanticSentence",
"Chunk",
"SentenceChunk",
"SemanticChunk",
"BaseChunker",
"TokenChunker",
"TokenChunker",
"WordChunker",
"SentenceChunker",
"SemanticChunker",
"SDPMChunker"
]
"SDPMChunker",
]
15 changes: 7 additions & 8 deletions src/chonkie/chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from .base import Chunk, BaseChunker
from .base import BaseChunker, Chunk
from .sdpm import SDPMChunker
from .semantic import SemanticChunk, SemanticChunker, SemanticSentence
from .sentence import Sentence, SentenceChunk, SentenceChunker
from .token import TokenChunker
from .word import WordChunker
from .sentence import Sentence, SentenceChunk, SentenceChunker
from .semantic import SemanticSentence, SemanticChunk, SemanticChunker
from .sdpm import SDPMChunker


__all__ = [
"Chunk",
"Chunk",
"BaseChunker",
"TokenChunker",
"WordChunker",
Expand All @@ -17,5 +16,5 @@
"SemanticSentence",
"SemanticChunk",
"SemanticChunker",
"SDPMChunker"
]
"SDPMChunker",
]
57 changes: 35 additions & 22 deletions src/chonkie/chunker/base.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,29 @@
from typing import List
from dataclasses import dataclass
import importlib
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List

import importlib

@dataclass
class Chunk:
"""Dataclass representing a text chunk with metadata."""

text: str
start_index: int
end_index: int
token_count: int


class BaseChunker(ABC):
"""Abstract base class for all chunker implementations.
All chunker implementations should inherit from this class and implement
the chunk() method according to their specific chunking strategy.
"""

def __init__(self, tokenizer):
"""Initialize the chunker with a tokenizer.
Args:
tokenizer: Tokenizer object to be used for tokenizing text
"""
Expand All @@ -40,12 +43,13 @@ def _get_tokenizer_backend(self):
return "tiktoken"
else:
raise ValueError("Tokenizer backend not supported")

def _load_tokenizer(self, tokenizer_name: str):
"""Load a tokenizer based on the backend."""
try:
try:
if importlib.util.find_spec("tiktoken") is not None:
from tiktoken import get_encoding

self._tokenizer_backend = "tiktoken"
return get_encoding(tokenizer_name)
else:
Expand All @@ -54,27 +58,36 @@ def _load_tokenizer(self, tokenizer_name: str):
try:
if importlib.util.find_spec("autotiktokenizer") is not None:
from autotiktokenizer import AutoTikTokenizer

self._tokenizer_backend = "tiktoken"
return AutoTikTokenizer.from_pretrained(tokenizer_name)
else:
raise Warning("AutoTikTokenizer library not found. Trying tokenizers.")
raise Warning(
"AutoTikTokenizer library not found. Trying tokenizers."
)
except Exception:
try:
if importlib.util.find_spec("tokenizers") is not None:
from tokenizers import Tokenizer

self._tokenizer_backend = "tokenizers"
return Tokenizer.from_pretrained(tokenizer_name)
else:
raise Warning("Tokenizers library not found. Trying transformers.")
raise Warning(
"Tokenizers library not found. Trying transformers."
)
except Exception:
try:
if importlib.util.find_spec("transformers") is not None:
from transformers import AutoTokenizer
self._tokenizer_backend = "transformers"

self._tokenizer_backend = "transformers"
return AutoTokenizer.from_pretrained(tokenizer_name)
except Exception:
raise ValueError("Tokenizer not found in the following libraries: transformers, tokenizers, autotiktokenizer, tiktoken",
"Please install one of these libraries to use the chunker.")
raise ValueError(
"Tokenizer not found in the following libraries: transformers, tokenizers, autotiktokenizer, tiktoken",
"Please install one of these libraries to use the chunker.",
)

def _encode(self, text: str):
"""Encode text using the backend tokenizer."""
Expand All @@ -86,11 +99,11 @@ def _encode(self, text: str):
return self.tokenizer.encode(text)
else:
raise ValueError("Tokenizer backend not supported.")

def _encode_batch(self, texts: List[str]):
"""Encode a batch of texts using the backend tokenizer."""
if self._tokenizer_backend == "transformers":
return self.tokenizer.batch_encode_plus(texts)['input_ids']
return self.tokenizer.batch_encode_plus(texts)["input_ids"]
elif self._tokenizer_backend == "tokenizers":
return self.tokenizer.encode_batch(texts)
elif self._tokenizer_backend == "tiktoken":
Expand All @@ -108,7 +121,7 @@ def _decode(self, tokens) -> str:
return self.tokenizer.decode(tokens)
else:
raise ValueError("Tokenizer backend not supported.")

def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
"""Decode a batch of token lists using the backend tokenizer."""
if self._tokenizer_backend == "transformers":
Expand All @@ -119,30 +132,30 @@ def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
return [self.tokenizer.decode(tokens) for tokens in token_lists]
else:
raise ValueError("Tokenizer backend not supported.")

@abstractmethod
def chunk(self, text: str) -> List[Chunk]:
"""Split text into chunks according to the implementation strategy.
Args:
text: Input text to be chunked
Returns:
List of Chunk objects containing the chunked text and metadata
"""
pass

def __call__(self, text: str) -> List[Chunk]:
"""Make the chunker callable directly.
Args:
text: Input text to be chunked
Returns:
List of Chunk objects containing the chunked text and metadata
"""
return self.chunk(text)

def __repr__(self) -> str:
"""Return string representation of the chunker."""
return f"{self.__class__.__name__}()"
return f"{self.__class__.__name__}()"
Loading

0 comments on commit 745e5e8

Please sign in to comment.