Merge pull request #13 from bhavnicksm/development

Run Black + Isort + beautify the code a bit
chonkie-ai · Nov 8, 2024 · 745e5e8 · 745e5e8
2 parents 0f45703 + 89249cc
commit 745e5e8
Show file tree

Hide file tree

Showing 14 changed files with 593 additions and 397 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chonkie"
-version = "0.1.1"
+version = "0.1.2"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,10 +20,19 @@ classifiers = [
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11"
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Information Technology",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Topic :: Text Processing :: Linguistic"
 ]
 dependencies = [
-    "autotiktokenizer", "tokenizers>=0.13.0"
+    "autotiktokenizer", "tokenizers>=0.13.0", "tiktoken>=0.2.0"
 ]
 
 [project.urls]
@@ -33,7 +42,7 @@ Homepage = "https://github.com/bhavnicksm/chonkie"
 sentence = ["spacy>=3.0.0"]
 semantic = ["sentence-transformers>=2.0.0", "numpy>=1.23.0"]
 all = ["spacy>=3.0.0", "sentence-transformers>=2.0.0", "numpy>=1.23.0"]
-dev = ["pytest>=6.2.0", "tranformers>=4.0.0", "tiktoken>=0.2.0"]
+dev = ["pytest>=6.2.0", "transformers>=4.0.0", "tiktoken>=0.2.0", "black>=21.12b0", "isort>=5.10.2", "flake8>=4.0.0", "mypy>=0.910", "pylint>=2.11.1", "pre-commit>=2.15.0"]
 
 [tool.setuptools]
 package-dir = {"" = "src"}

diff --git a/src/chonkie/__init__.py b/src/chonkie/__init__.py
@@ -1,18 +1,9 @@
-from .chunker import (
-    BaseChunker,
-    TokenChunker,
-    WordChunker,
-    SentenceChunker,
-    SemanticChunker,
-    SDPMChunker,
-    Chunk, 
-    SentenceChunk,
-    SemanticChunk,
-    Sentence,
-    SemanticSentence
-)
+from .chunker import (BaseChunker, Chunk, SDPMChunker, SemanticChunk,
+                      SemanticChunker, SemanticSentence, Sentence,
+                      SentenceChunk, SentenceChunker, TokenChunker,
+                      WordChunker)
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 __name__ = "chonkie"
 __author__ = "Bhavnick Minhas"
 
@@ -21,14 +12,14 @@
     "__version__",
     "__author__",
     "Sentence",
-    "SemanticSentence", 
+    "SemanticSentence",
     "Chunk",
     "SentenceChunk",
     "SemanticChunk",
     "BaseChunker",
-    "TokenChunker", 
+    "TokenChunker",
     "WordChunker",
     "SentenceChunker",
     "SemanticChunker",
-    "SDPMChunker"
-]
+    "SDPMChunker",
+]
diff --git a/src/chonkie/chunker/__init__.py b/src/chonkie/chunker/__init__.py
@@ -1,13 +1,12 @@
-from .base import Chunk, BaseChunker
+from .base import BaseChunker, Chunk
+from .sdpm import SDPMChunker
+from .semantic import SemanticChunk, SemanticChunker, SemanticSentence
+from .sentence import Sentence, SentenceChunk, SentenceChunker
 from .token import TokenChunker
 from .word import WordChunker
-from .sentence import Sentence, SentenceChunk, SentenceChunker
-from .semantic import SemanticSentence, SemanticChunk, SemanticChunker
-from .sdpm import SDPMChunker
-
 
 __all__ = [
-    "Chunk", 
+    "Chunk",
     "BaseChunker",
     "TokenChunker",
     "WordChunker",
@@ -17,5 +16,5 @@
     "SemanticSentence",
     "SemanticChunk",
     "SemanticChunker",
-    "SDPMChunker"
-]
+    "SDPMChunker",
+]
diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
@@ -1,26 +1,29 @@
-from typing import List
-from dataclasses import dataclass
+import importlib
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List
 
-import importlib
 
 @dataclass
 class Chunk:
     """Dataclass representing a text chunk with metadata."""
+
     text: str
     start_index: int
     end_index: int
     token_count: int
 
+
 class BaseChunker(ABC):
     """Abstract base class for all chunker implementations.
-    
+
     All chunker implementations should inherit from this class and implement
     the chunk() method according to their specific chunking strategy.
     """
+
     def __init__(self, tokenizer):
         """Initialize the chunker with a tokenizer.
-        
+
         Args:
             tokenizer: Tokenizer object to be used for tokenizing text
         """
@@ -40,12 +43,13 @@ def _get_tokenizer_backend(self):
             return "tiktoken"
         else:
             raise ValueError("Tokenizer backend not supported")
-    
+
     def _load_tokenizer(self, tokenizer_name: str):
         """Load a tokenizer based on the backend."""
-        try: 
+        try:
             if importlib.util.find_spec("tiktoken") is not None:
                 from tiktoken import get_encoding
+
                 self._tokenizer_backend = "tiktoken"
                 return get_encoding(tokenizer_name)
             else:
@@ -54,27 +58,36 @@ def _load_tokenizer(self, tokenizer_name: str):
             try:
                 if importlib.util.find_spec("autotiktokenizer") is not None:
                     from autotiktokenizer import AutoTikTokenizer
+
                     self._tokenizer_backend = "tiktoken"
                     return AutoTikTokenizer.from_pretrained(tokenizer_name)
                 else:
-                    raise Warning("AutoTikTokenizer library not found. Trying tokenizers.")
+                    raise Warning(
+                        "AutoTikTokenizer library not found. Trying tokenizers."
+                    )
             except Exception:
                 try:
                     if importlib.util.find_spec("tokenizers") is not None:
                         from tokenizers import Tokenizer
+
                         self._tokenizer_backend = "tokenizers"
                         return Tokenizer.from_pretrained(tokenizer_name)
                     else:
-                        raise Warning("Tokenizers library not found. Trying transformers.")
+                        raise Warning(
+                            "Tokenizers library not found. Trying transformers."
+                        )
                 except Exception:
                     try:
                         if importlib.util.find_spec("transformers") is not None:
                             from transformers import AutoTokenizer
-                            self._tokenizer_backend  = "transformers"
+
+                            self._tokenizer_backend = "transformers"
                             return AutoTokenizer.from_pretrained(tokenizer_name)
                     except Exception:
-                        raise ValueError("Tokenizer not found in the following libraries: transformers, tokenizers, autotiktokenizer, tiktoken", 
-                                         "Please install one of these libraries to use the chunker.")    
+                        raise ValueError(
+                            "Tokenizer not found in the following libraries: transformers, tokenizers, autotiktokenizer, tiktoken",
+                            "Please install one of these libraries to use the chunker.",
+                        )
 
     def _encode(self, text: str):
         """Encode text using the backend tokenizer."""
@@ -86,11 +99,11 @@ def _encode(self, text: str):
             return self.tokenizer.encode(text)
         else:
             raise ValueError("Tokenizer backend not supported.")
-    
+
     def _encode_batch(self, texts: List[str]):
         """Encode a batch of texts using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
-            return self.tokenizer.batch_encode_plus(texts)['input_ids']
+            return self.tokenizer.batch_encode_plus(texts)["input_ids"]
         elif self._tokenizer_backend == "tokenizers":
             return self.tokenizer.encode_batch(texts)
         elif self._tokenizer_backend == "tiktoken":
@@ -108,7 +121,7 @@ def _decode(self, tokens) -> str:
             return self.tokenizer.decode(tokens)
         else:
             raise ValueError("Tokenizer backend not supported.")
-    
+
     def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
         """Decode a batch of token lists using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
@@ -119,30 +132,30 @@ def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
             return [self.tokenizer.decode(tokens) for tokens in token_lists]
         else:
             raise ValueError("Tokenizer backend not supported.")
-    
+
     @abstractmethod
     def chunk(self, text: str) -> List[Chunk]:
         """Split text into chunks according to the implementation strategy.
-        
+
         Args:
             text: Input text to be chunked
-            
+
         Returns:
             List of Chunk objects containing the chunked text and metadata
         """
         pass
 
     def __call__(self, text: str) -> List[Chunk]:
         """Make the chunker callable directly.
-        
+
         Args:
             text: Input text to be chunked
-            
+
         Returns:
             List of Chunk objects containing the chunked text and metadata
         """
         return self.chunk(text)
-    
+
     def __repr__(self) -> str:
         """Return string representation of the chunker."""
-        return f"{self.__class__.__name__}()"
+        return f"{self.__class__.__name__}()"