[Fix] Unify dataclasses under a types.py for ease #80

Merged
3 commits merged on Dec 6, 2024
Changes from all commits
14 changes: 8 additions & 6 deletions src/chonkie/__init__.py
@@ -2,18 +2,12 @@

from .chunker import (
BaseChunker,
Chunk,
SDPMChunker,
SemanticChunk,
SemanticChunker,
SemanticSentence,
Sentence,
SentenceChunk,
SentenceChunker,
TokenChunker,
WordChunker,
)
from .context import Context
from .embeddings import (
AutoEmbeddings,
BaseEmbeddings,
@@ -25,6 +19,14 @@
BaseRefinery,
OverlapRefinery,
)
from .types import (
Chunk,
Context,
SemanticChunk,
SemanticSentence,
Sentence,
SentenceChunk,
)

__version__ = "0.2.1.post1"
__name__ = "chonkie"
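
After this change the dataclasses are defined once in the new types module and re-exported from the package root, so both import paths below resolve to the same classes. A minimal sketch (the constructor values are illustrative only):

from chonkie import Chunk, Context        # still works via the re-export above
from chonkie.types import Chunk, Context  # canonical location after the move

# Field names match the dataclass moved out of chunker/base.py below.
chunk = Chunk(text="Hello, world!", start_index=0, end_index=13, token_count=4)
print(chunk.text, chunk.token_count)
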
13 changes: 5 additions & 8 deletions src/chonkie/chunker/__init__.py
@@ -1,20 +1,17 @@
from .base import BaseChunker, Chunk
"""Module for chunkers."""

from .base import BaseChunker
from .sdpm import SDPMChunker
from .semantic import SemanticChunk, SemanticChunker, SemanticSentence
from .sentence import Sentence, SentenceChunk, SentenceChunker
from .semantic import SemanticChunker
from .sentence import SentenceChunker
from .token import TokenChunker
from .word import WordChunker

__all__ = [
"Chunk",
"BaseChunker",
"TokenChunker",
"WordChunker",
"Sentence",
"SentenceChunk",
"SentenceChunker",
"SemanticSentence",
"SemanticChunk",
"SemanticChunker",
"SDPMChunker",
]
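
Because the chunker subpackage no longer re-exports the dataclasses, callers that imported them from chonkie.chunker need the new path. A hedged before/after sketch (only the module paths shown in this diff are assumed):

# Before this PR the dataclasses were re-exported by the chunker subpackage:
# from chonkie.chunker import Chunk, Sentence, SemanticChunk

# After this PR they come from the unified types module (or the package root):
from chonkie.types import Chunk, Sentence, SemanticChunk
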
66 changes: 2 additions & 64 deletions src/chonkie/chunker/base.py
@@ -4,72 +4,10 @@
import inspect
import warnings
from abc import ABC, abstractmethod
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, List, Union

from chonkie.context import Context


@dataclass
class Chunk:
"""Dataclass representing a text chunk with metadata.

All attributes are read-only via slots for performance reasons.

Attributes:
text: The text content of the chunk
start_index: The starting index of the chunk in the original text
end_index: The ending index of the chunk in the original text
token_count: The number of tokens in the chunk
context: The context of the chunk, useful for refinery classes

"""

text: str
start_index: int
end_index: int
token_count: int
context: Optional[Context] = None

def __str__(self) -> str:
"""Return string representation of the chunk."""
return self.text

def __len__(self) -> int:
"""Return the length of the chunk."""
return len(self.text)

def __repr__(self) -> str:
"""Return string representation of the chunk."""
if self.context is not None:
return (
f"Chunk(text={self.text}, start_index={self.start_index}, "
f"end_index={self.end_index}, token_count={self.token_count})"
)
else:
return (
f"Chunk(text={self.text}, start_index={self.start_index}, "
f"end_index={self.end_index}, token_count={self.token_count}, "
f"context={self.context})"
)

def __iter__(self):
"""Return an iterator over the chunk."""
return iter(self.text)

def __getitem__(self, index: int):
"""Return the item at the given index."""
return self.text[index]

def copy(self) -> "Chunk":
"""Return a deep copy of the chunk."""
return Chunk(
text=self.text,
start_index=self.start_index,
end_index=self.end_index,
token_count=self.token_count,
)
from chonkie.types import Chunk


class BaseChunker(ABC):
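
The dataclass removed here now lives in chonkie.types, so its behavior is unchanged for callers. A short usage sketch of the dunder methods documented above (values are illustrative):

from chonkie.types import Chunk

chunk = Chunk(text="chonk", start_index=0, end_index=5, token_count=1)
print(str(chunk))     # "chonk": __str__ returns the raw text
print(len(chunk))     # 5: __len__ delegates to len(self.text)
print(chunk[0])       # "c": __getitem__ indexes into the text
clone = chunk.copy()  # copy() rebuilds the chunk without the optional context field
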
9 changes: 6 additions & 3 deletions src/chonkie/chunker/sdpm.py
@@ -1,7 +1,10 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union

from .semantic import SemanticChunk, SemanticChunker, Sentence
from chonkie.types import SemanticChunk, Sentence

from .semantic import SemanticChunker


class SDPMChunker(SemanticChunker):
@@ -23,7 +26,7 @@ class SDPMChunker(SemanticChunker):

Methods:
chunk: Split text into chunks using the SDPM approach.

"""

def __init__(
@@ -133,7 +136,7 @@ def chunk(self, text: str) -> List[SemanticChunk]:
sentences = self._prepare_sentences(text)
if len(sentences) <= self.min_sentences:
return [self._create_chunk(sentences)]

# Calculate similarity threshold
self.similarity_threshold = self._calculate_similarity_threshold(sentences)

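
With the imports updated, SDPMChunker still subclasses SemanticChunker but takes its SemanticChunk and Sentence types from chonkie.types. A usage sketch, assuming the chunker can be constructed with default arguments (the constructor signature is not shown in this diff):

from chonkie import SDPMChunker

# Constructor arguments are elided because they are not part of this diff;
# pass an embedding model or thresholds as your installed version expects.
chunker = SDPMChunker()
chunks = chunker.chunk("First sentence. Second sentence. A third one to merge.")
for chunk in chunks:  # chunk() returns List[SemanticChunk] per the signature above
    print(chunk.token_count, chunk.text)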