Remove Spacy dependency from 'sentence' install + Add FAQ to DOCS.md #21

Merged · 9 commits · Nov 11, 2024
Remove sentence_mode parameter from SDPMChunker and related tests for simplification
bhavnicksm committed Nov 11, 2024
commit 3b6618a1f2e9f2d8228dbc107199c25afc5e825b
5 changes: 0 additions & 5 deletions src/chonkie/chunker/sdpm.py
@@ -12,8 +12,6 @@ def __init__(
similarity_percentile: float = None,
max_chunk_size: int = 512,
initial_sentences: int = 1,
sentence_mode: str = "heuristic",
spacy_model: str = "en_core_web_sm",
skip_window: int = 1, # How many chunks to skip when looking for similarities
):
"""Initialize the SDPMChunker.
@@ -29,8 +27,6 @@ def __init__(
similarity_threshold=similarity_threshold,
similarity_percentile=similarity_percentile,
initial_sentences=initial_sentences,
sentence_mode=sentence_mode,
spacy_model=spacy_model,
)
self.skip_window = skip_window

@@ -121,6 +117,5 @@ def __repr__(self) -> str:
f"SPDMChunker(max_chunk_size={self.max_chunk_size}, "
f"{threshold_info}, "
f"initial_sentences={self.initial_sentences}, "
f"sentence_mode='{self.sentence_mode}', "
f"skip_window={self.skip_window})"
)
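
With sentence_mode and spacy_model gone, SDPMChunker is constructed with only the semantic parameters shown above. A minimal usage sketch under those assumptions — the import path, tokenizer value, embedding-model name, and chunk() call are illustrative, not taken from this diff:

# Usage sketch (assumptions: top-level SDPMChunker export, a chunk() method,
# and "all-MiniLM-L6-v2" as the embedding model; verify against the package).
from chonkie import SDPMChunker

chunker = SDPMChunker(
    tokenizer="gpt2",                     # assumed; docstring above says "Tokenizer for counting tokens"
    embedding_model="all-MiniLM-L6-v2",   # assumed sentence-transformers model id
    similarity_threshold=0.7,             # set this or similarity_percentile, not both
    max_chunk_size=512,
    initial_sentences=1,
    skip_window=1,                        # chunks to skip when looking for similarities
)
chunks = chunker.chunk("First sentence. Second one. And a third.")

Exactly one of similarity_threshold or similarity_percentile should be supplied, per the validation retained in SemanticChunker.__init__ below.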
163 changes: 100 additions & 63 deletions src/chonkie/chunker/semantic.py
@@ -36,21 +36,19 @@ def __init__(
similarity_threshold: Optional[float] = None,
similarity_percentile: Optional[float] = None,
max_chunk_size: int = 512,
initial_sentences: int = 1,
sentence_mode: str = "heuristic",
spacy_model: str = "en_core_web_sm",
initial_sentences: int = 1
):
"""Initialize the SemanticChunker.

SemanticChunkers split text into semantically coherent chunks using embeddings.

Args:
tokenizer: Tokenizer for counting tokens
embedding_model: Name of the sentence-transformers model to load
max_chunk_size: Maximum tokens allowed per chunk
similarity_threshold: Absolute threshold for semantic similarity (0-1)
similarity_percentile: Percentile threshold for similarity (0-100)
initial_sentences: Number of sentences to start each chunk with
sentence_mode: "heuristic" or "spacy" for sentence splitting
spacy_model: Name of spaCy model to use if sentence_mode="spacy"

Raises:
ValueError: If parameters are invalid
@@ -76,14 +74,11 @@ def __init__(
raise ValueError(
"Must specify either similarity_threshold or similarity_percentile"
)
if sentence_mode not in ["heuristic", "spacy"]:
raise ValueError("sentence_mode must be 'heuristic' or 'spacy'")

self.max_chunk_size = max_chunk_size
self.similarity_threshold = similarity_threshold
self.similarity_percentile = similarity_percentile
self.initial_sentences = initial_sentences
self.sentence_mode = sentence_mode

# Load sentence-transformers model
self._import_sentence_transformers()
@@ -94,43 +89,6 @@ def __init__(
else:
self.embedding_model = embedding_model

# Initialize spaCy if explicitly requested
if sentence_mode == "spacy":
self._import_spacy()

if not self.SPACY_AVAILABLE:
raise ImportError(
"spaCy is not installed. Install it with 'pip install spacy' "
"and download the model with 'python -m spacy download en_core_web_sm', "
"or use sentence_mode='heuristic' instead."
)
try:
self.nlp = spacy.load(spacy_model)
except OSError as e:
raise ImportError(
f"spaCy model '{spacy_model}' not found. "
f"Download it with 'python -m spacy download {spacy_model}' "
"or use sentence_mode='heuristic' instead."
) from e

def _import_spacy(self) -> Any:
"""Import spaCy library. Imports mentioned inside the class,
because it takes too long to import the whole library at the beginning of the file.
"""
# Check if spaCy is available
self.SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
if self.SPACY_AVAILABLE:
try:
global spacy
import spacy
except ImportError:
self.SPACY_AVAILABLE = False
warnings.warn(
"Failed to import spaCy despite it being installed. SemanticChunker will not work."
)
else:
warnings.warn("spaCy is not installed. SemanticChunker will not work.")

def _import_sentence_transformers(self) -> Any:
"""Import sentence-transformers library. Imports mentioned inside the class,
because it takes too long to import the whole library at the beginning of the file.
@@ -164,23 +122,103 @@ def _load_sentence_transformer_model(self, model_name: str) -> Any:
) from e
return model

def _split_sentences_spacy(self, text: str) -> List[str]:
"""Split text into sentences using spaCy."""
doc = self.nlp(text)
return [str(sent).strip() for sent in doc.sents if str(sent).strip()]

def _split_sentences_heuristic(self, text: str) -> List[str]:
"""Split text into sentences using rule-based approach."""
pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*[A-Z])|(?<=[.!?])\s*$"
sentences = re.split(pattern, text)
return [s.strip() for s in sentences if s.strip()]

def _split_sentences(self, text: str) -> List[str]:
"""Split text into sentences using specified mode."""
if self.sentence_mode == "heuristic":
return self._split_sentences_heuristic(text)
else:
return self._split_sentences_spacy(text)
"""Split text into sentences using enhanced regex patterns.

Handles various cases including:
- Standard sentence endings across multiple writing systems
- Quotations and parentheses
- Common abbreviations
- Decimal numbers
- Ellipsis
- Lists and enumerations
- Special punctuation
- Common honorifics and titles

Args:
text: Input text to be split into sentences

Returns:
List of sentences
"""
# Define sentence ending punctuation marks from various writing systems
sent_endings = (
r'[!.?։؟۔܀܁܂߹।॥၊။።፧፨᙮᜵᜶᠃᠉᥄᥅᪨᪩᪪᪫᭚᭛᭞᭟᰻᰼᱾᱿'
r'‼‽⁇⁈⁉⸮⸼꓿꘎꘏꛳꛷꡶꡷꣎꣏꤯꧈꧉꩝꩞꩟꫰꫱꯫﹒﹖﹗!.?𐩖𐩗'
r'𑁇𑁈𑂾𑂿𑃀𑃁𑅁𑅂𑅃𑇅𑇆𑇍𑇞𑇟𑈸𑈹𑈻𑈼𑊩𑑋𑑌𑗂𑗃𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒'
r'𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑜼𑜽𑜾𑩂𑩃𑪛𑪜𑱁𑱂𖩮𖩯𖫵𖬷𖬸𖭄𛲟𝪈。。]'
)

# Common abbreviations and titles that don't end sentences
abbrevs = (
r"(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|viz|al|Gen|Col|Fig|Lt|Mt|St"
r"|etc|approx|appt|apt|dept|est|min|max|misc|no|ps|seq|temp|etal"
r"|e\.g|i\.e|vol|vs|cm|mm|km|kg|lb|ft|pd|hr|sec|min|sq|fx|Feb|Mar"
r"|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)"
)

# First, protect periods in known abbreviations
text = re.sub(rf"({abbrevs})\.", r"\1@POINT@", text, flags=re.IGNORECASE)

# Protect decimal numbers
text = re.sub(r"(\d+)\.(\d+)", r"\1@POINT@\2", text)

# Protect ellipsis
text = re.sub(r"\.{3}", "@ELLIPSIS@", text)

# Protect email addresses and websites
text = re.sub(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"@EMAIL@\1@EMAIL@", text)
text = re.sub(r"(https?://[^\s]+)", r"@URL@\1@URL@", text)

# Handle parentheses and brackets
text = re.sub(r'\([^)]*\.[^)]*\)', lambda m: m.group().replace('.', '@POINT@'), text)
text = re.sub(r'\[[^\]]*\.[^\]]*\]', lambda m: m.group().replace('.', '@POINT@'), text)

# Handle quotations with sentence endings
text = re.sub(rf'({sent_endings})"(\s+[A-Z])', r'\1"\n\2', text)

# Handle standard sentence endings
text = re.sub(rf'({sent_endings})(\s+[A-Z"])', r'\1\n\2', text)
text = re.sub(rf'({sent_endings})(\s*$)', r'\1\n', text)

# Handle lists and enumerations
text = re.sub(r'(\d+\.)(\s+[A-Z])', r'\1\n\2', text)
text = re.sub(r'([a-zA-Z]\.)(\s+[A-Z])', r'\1\n\2', text)

# Handle special cases
text = re.sub(rf'({sent_endings})\s*([);:,"])*\s*(\n|\s*$)', r'\1\2\n', text)

# Restore protected periods and symbols
text = text.replace("@POINT@", ".")
text = text.replace("@ELLIPSIS@", "...")
text = re.sub(r'@EMAIL@([^@]+)@EMAIL@', r'\1', text)
text = re.sub(r'@URL@([^@]+)@URL@', r'\1', text)

# Split into sentences and clean up
sentences = [s.strip() for s in text.split('\n') if s.strip()]

# Get token counts for sentences
# token_counts = self._get_token_counts(sentences)

# # Create Sentence objects
# result_sentences = []
# current_pos = 0
# for sent, token_count in zip(sentences, token_counts):
# # Find the actual position in original text
# start_idx = text.find(sent, current_pos)
# end_idx = start_idx + len(sent)
# current_pos = end_idx

# result_sentences.append(
# Sentence(
# text=sent,
# start_index=start_idx,
# end_index=end_idx,
# token_count=token_count
# )
# )

return sentences

def _compute_similarity_threshold(self, all_similarities: List[float]) -> float:
"""Compute similarity threshold based on percentile if specified."""
@@ -406,6 +444,5 @@ def __repr__(self) -> str:
return (
f"SemanticChunker(max_chunk_size={self.max_chunk_size}, "
f"{threshold_info}, "
f"initial_sentences={self.initial_sentences}, "
f"sentence_mode='{self.sentence_mode}')"
f"initial_sentences={self.initial_sentences})"
)
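
The new _split_sentences above follows a protect–split–restore pattern: mask periods that do not end a sentence (abbreviations, decimals, ellipses, URLs), insert newlines at genuine sentence endings, then restore the masked characters. A stripped-down standalone sketch of that idea — the function name, reduced abbreviation list, and sample output are illustrative, not the library's exact code:

# Simplified sketch of the placeholder-token heuristic; the real method in the
# diff above also handles many writing systems, quotations, lists, and URLs.
import re
from typing import List

def split_sentences(text: str) -> List[str]:
    abbrevs = r"(?:Mr|Mrs|Ms|Dr|Prof|e\.g|i\.e|etc)"
    # Protect periods that do not terminate a sentence.
    text = re.sub(rf"\b({abbrevs})\.", r"\1@POINT@", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d+)\.(\d+)", r"\1@POINT@\2", text)   # decimal numbers
    text = re.sub(r"\.{3}", "@ELLIPSIS@", text)            # ellipsis
    # Break at sentence-ending punctuation followed by a capitalised word.
    text = re.sub(r'([.!?])(\s+[A-Z"])', r"\1\n\2", text)
    # Restore the protected symbols, then split on the inserted newlines.
    text = text.replace("@POINT@", ".").replace("@ELLIPSIS@", "...")
    return [s.strip() for s in text.split("\n") if s.strip()]

print(split_sentences("Dr. Smith paid $3.50. He left... Then Mrs. Jones arrived."))
# ['Dr. Smith paid $3.50.', 'He left... Then Mrs. Jones arrived.']

Because the heuristic is pure regex, the 'sentence' install no longer needs spaCy or a downloaded model, which is exactly the dependency this PR removes.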