Update type annotations and comments

CanCLID · Jul 14, 2024 · cc23aef · cc23aef
1 parent 629e515
commit cc23aef
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -1,26 +1,22 @@
 # CantoneseDetect 粵語特徵分類器
 
-[![license](https://img.shields.io/github/license/DAVFoundation/captain-n3m0.svg?style=flat-square)](https://github.com/DAVFoundation/captain-n3m0/blob/master/LICENSE)
+[![license](https://img.shields.io/github/license/DAVFoundation/captain-n3m0.svg?style=for-the-badge&color=)](https://github.com/DAVFoundation/captain-n3m0/blob/master/LICENSE)
 
 本項目為 [canto-filter](https://github.com/CanCLID/canto-filter) 之後續。canto-filter 得 4 個分類標籤且判斷邏輯更加快速簡單，適合在線快速篩選判別文本或者其他要求低延遲、速度快嘅應用場合。本項目採用更精細嘅判斷邏輯，有 6 個分類標籤，準確度更高，但速度亦會相對 canto-filter 更慢。
 
 This is an extension of the [canto-filter](https://github.com/CanCLID/canto-filter) project. canto-filter has only 4 output labels. It has a simpler classification logic and is faster, making it more suitable for use cases which require low latency or high classification speed. This package has 6 output labels and uses a more sophisticated classification logic for more fine-grained classification. It has higher classification accuracy but slower performance.
 
 ## 引用 Citation
 
-抽出字詞特徵嘅策略同埋實踐方式，喺下面整理。討論本分類器時，請引用：
-
-Chaak-ming Lau, Mingfei Lau, and Ann Wai Huen To. 2024.
-[The Extraction and Fine-grained Classification of Written Cantonese Materials through Linguistic Feature Detection.](https://aclanthology.org/2024.eurali-1.4/)
-In Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI)
-@ LREC-COLING 2024, pages 24–29, Torino, Italia. ELRA and ICCL.
-
 分類器採用嘅分類標籤及基準，參考咗對使用者嘅語言意識形態嘅研究。討論分類準則時，請引用：
 
 The definitions and boundaries of the labels depend on the user's language ideology.
 When discussing the criteria adopted by this tool, please cite:
 
-Lau, Chaak Ming. 2024. Ideologically driven divergence in Cantonese vernacular writing practices. In J.-F. Dupré, editor, _Politics of Language in Hong Kong_, Routledge.
+> Chaak-ming Lau, Mingfei Lau, and Ann Wai Huen To. 2024.
+> [The Extraction and Fine-grained Classification of Written Cantonese Materials through Linguistic Feature Detection.](https://aclanthology.org/2024.eurali-1.4/)
+> In Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI)
+> @ LREC-COLING 2024, pages 24–29, Torino, Italia. ELRA and ICCL.
 
 ---
 
@@ -67,10 +63,14 @@ pip install cantonesedetect
 
 可以通過 Python 函數嚟引用，亦可以直接 CLI 調用。
 
+You can call the Python API of this library, or run it directly from the CLI.
+
 ### Python
 
 用下面嘅方法創建一個 `Detector`，然後直接調用 `judge()`就可以得到分類結果：
 
+Initialize a `Detector` and call the `judge()` function on inputs, and you will get the classification outputs.
+
 ```python
 from cantonesedetect import CantoneseDetector
 
@@ -87,6 +87,8 @@ detector.judge('那就「是咁的」')  # mixed_quotes_in_swc
 
 如果想要用引號抽取判別、分句判別同埋獲得分析結果，可以：
 
+If you want to judge inputs with matrix-quote splitting or segment splitting, and get the analysis results, you can:
+
 ```python
 from cantonesedetect import Detector
 
@@ -112,8 +114,11 @@ print([j.value for j in document_features.document_segments_judgements])
 
 如果直接喺 CLI 調用嘅話，只需要指明`--input`就得。 `--quotes`、`--split`、`--print_analysis`三個參數都默認關閉，如果標明就會打開：
 
+If you run it directly from the CLI, simply specify `--input`. The optional arguments `--quotes`, `--split` and `--print_analysis` are all `False` by default, and you can turn them on by specifying them.
+
 ```bash
 cantonesedetect --input input.txt
 # 開啓引號抽取判別、分句判別並且打印分析結果
+# Enable matrix-quotes-splitting, segment-splitting and printing the analysis.
 cantonesedetect --input input.txt --quotes --split --print_analysis
 ```
diff --git a/cantonesedetect/Detector.py b/cantonesedetect/Detector.py
@@ -7,11 +7,11 @@
 import math
 import re
 from collections import Counter
-from typing import List, Tuple, Optional
+from typing import List, Optional, Tuple
 
 from .DocumentFeatures import DocumentFeatures
-from .SegmentFeatures import SegmentFeatures
 from .JudgementTypes import JudgementType
+from .SegmentFeatures import SegmentFeatures
 
 # Cantonese characters not found in SWC
 CANTO_FEATURE_RE = re.compile(
@@ -92,7 +92,7 @@ def __init__(self, split_seg: bool = False, use_quotes: bool = False, get_analys
 
     def _hant_length(self, segment: str) -> int:
         """
-        Return the number of Han characters in a segment.
+        Return the number of Han characters in a segment. Punctuation is excluded.
 
         Args:
             segment (str): The segment of text to be analyzed.
@@ -144,7 +144,7 @@ def _get_segment_features(self, segment: str) -> SegmentFeatures:
 
         return segment_features
 
-    def _judge_single_segment(self, segment: str) -> Tuple[JudgementType, Optional[SegmentFeatures]]:
+    def _judge_single_segment(self, segment: str) -> JudgementType | Tuple[JudgementType, SegmentFeatures]:
         """
         Determine the language of a segment based on the presence of Cantonese and SWC features.
 
@@ -198,13 +198,14 @@ def _judge_single_segment(self, segment: str) -> Tuple[JudgementType, Optional[S
             else:
                 return (JudgementType.MIXED, segment_features) if self.get_analysis else JudgementType.MIXED
 
-    def _judge_segments(self, segments: List[str], document_features: DocumentFeatures = None) -> Tuple[JudgementType, Optional[DocumentFeatures]]:
+    def _judge_segments(self, segments: List[str], document_features: Optional[DocumentFeatures] = None) -> JudgementType | Tuple[JudgementType, DocumentFeatures]:
         """
         Given a list of segments:
         1. If >95% of the segments are Neutral, the overall judgement is Neutral
         2. If Neutral + Cantonese takes up >95%, then overall it is Cantonese
         3. If Neutral + SWC takes up > 95%, then overall it is SWC
         4. Otherwise, it is Mixed.
+        If self.get_analysis is True, return the document features as well.
 
         Args:
             segments (list): A list of segments to be judged.
@@ -252,7 +253,7 @@ def _judge_segments(self, segments: List[str], document_features: DocumentFeatur
         else:
             return (JudgementType.MIXED, document_features) if self.get_analysis else JudgementType.MIXED
 
-    def _judge_document(self, document: str) -> Tuple[JudgementType, Optional[DocumentFeatures]]:
+    def _judge_document(self, document: str) -> JudgementType | Tuple[JudgementType, DocumentFeatures]:
         """
         For an input document, judge based on whether `split_seg` and `get_analysis` are True or False.
 
@@ -264,11 +265,11 @@ def _judge_document(self, document: str) -> Tuple[JudgementType, Optional[Docume
         """
         # Split the document into segments if split_seg is True
         if self.split_seg:
-            segments = filter(lambda x: x.strip(),
-                              ALL_DELIMITERS_RE.split(document))
+            segments: List[str] = filter(lambda x: x.strip(),
+                                         ALL_DELIMITERS_RE.split(document))
         # Otherwise, treat the document as a single segment
         else:
-            segments = [document]
+            segments: List[str] = [document]
 
         if self.get_analysis:
             # Store document features in an object if get_analysis is True
@@ -283,7 +284,7 @@ def _judge_document(self, document: str) -> Tuple[JudgementType, Optional[Docume
             judgement = self._judge_segments(segments)
             return judgement
 
-    def _judge_matrix_quotes(self, document: str) -> Tuple[JudgementType, Optional[DocumentFeatures]]:
+    def _judge_matrix_quotes(self, document: str) -> JudgementType | Tuple[JudgementType, DocumentFeatures]:
         """
         Judge the language of a document with quotes.
 
@@ -348,15 +349,16 @@ def _judge_matrix_quotes(self, document: str) -> Tuple[JudgementType, Optional[D
                 else:
                     return JudgementType.MIXED
 
-    def judge(self, document: str) -> Tuple[JudgementType, Optional[DocumentFeatures]]:
+    def judge(self, document: str) -> JudgementType | Tuple[JudgementType, DocumentFeatures]:
         """
         The only exposed api. Judge the language of a document.
 
         Args:
             document (str): The document to be judged.
 
         Returns:
-            str: The final judgement.
+            JudgementType: The final judgement.
+            (if self.get_analysis) DocumentFeatures: The features of the document if get_analysis is True.
         """
         if self.use_quotes:
             return self._judge_matrix_quotes(document)

diff --git a/cantonesedetect/DocumentFeatures.py b/cantonesedetect/DocumentFeatures.py
@@ -1,24 +1,32 @@
-import sys
 from typing import List
+
 from cantonesedetect.JudgementTypes import JudgementType
 from cantonesedetect.SegmentFeatures import SegmentFeatures
 
 
 class DocumentFeatures:
-    def __init__(self, split_seg, use_quotes):
+    """
+    Store the `SegmentFeatures` objects and segment judgements of the document.
+    """
+
+    def __init__(self, split_seg, use_quotes) -> None:
         self.split_seg = split_seg
         self.use_quotes = use_quotes
 
         self.document_segments_features: List[SegmentFeatures] = []
         self.document_segments_judgements: List[JudgementType] = []
 
-    def _merge_judgements_features(self, matrix_judgements: List[JudgementType], quotes_judgements: List[JudgementType], matrix_features: List[SegmentFeatures], quotes_features: List[SegmentFeatures]):
+    def _merge_judgements_features(self, matrix_judgements: List[JudgementType], quotes_judgements: List[JudgementType], matrix_features: List[SegmentFeatures], quotes_features: List[SegmentFeatures]) -> None:
+        """
+        For documents that split the matrix and quotes, the document judgements and features 
+        are merged from those of the matrix and quotes.
+        """
         assert self.use_quotes is True
 
         self.document_segments_features = matrix_features + quotes_features
         self.document_segments_judgements = matrix_judgements + quotes_judgements
 
-    def get_analysis(self):
+    def get_analysis(self) -> str:
         """
         Return a string representation of the document features
         """

diff --git a/cantonesedetect/cli.py b/cantonesedetect/cli.py
@@ -7,7 +7,8 @@
 
 def main():
     """
-    When used as a command line tool, specify input text file with `--input <INPUT.txt>`, and output mode with `--mode <MODE>`.
+    When used as a command line tool, specify input text file with `--input <INPUT.txt>`, 
+    and output mode with `--mode <MODE>`.
     """
     argparser = argparse.ArgumentParser(
         description='Specify input text file with `--input <INPUT.txt>`, where each line is a sentence. ')

diff --git a/cantonesedetect/version.py b/cantonesedetect/version.py
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"
diff --git a/tests/test_detector.py b/tests/test_detector.py
@@ -15,19 +15,28 @@ def setUp(self):
 
     @pytest.mark.private
     def test_hant_length(self):
+        """
+        Non-Sinograms are not counted.
+        """
         self.assertEqual(self.detector._hant_length("早晨。"), 2)
         self.assertEqual(self.detector._hant_length("Hello，早晨"), 2)
         self.assertEqual(self.detector._hant_length("123 foobar。"), 0)
 
     @pytest.mark.private
     def test_separate_quotes(self):
+        """
+        `_separate_quotes()` should always return 2 strings.
+        """
         document = "一外「一內」二外『二內』三外“三內”。"
         matrix, quotes = self.detector._separate_quotes(document)
         self.assertEqual(matrix, "一外…二外…三外…。")
         self.assertEqual(quotes, "一內…二內…三內")
 
     @pytest.mark.private
     def test_get_segment_features(self):
+        """
+        `_get_segment_features()` should return a `SegmentFeatures` object.
+        """
         segment = "我哋去邊度食飯啊？我們去哪裏吃飯呢？"
         segment_features = self.detector._get_segment_features(segment)
         self.assertEqual(segment_features.canto_feature_count, 2)  # 哋、邊度
@@ -36,6 +45,9 @@ def test_get_segment_features(self):
 
     @pytest.mark.private
     def test_judge_single_segment(self):
+        """
+        `_judge_single_segment()` should return a `JudgementType`, which can be a string.
+        """
         self.assertEqual(self.detector._judge_single_segment(
             "我哋去邊度食飯？"), "cantonese")
         self.assertEqual(
@@ -45,17 +57,26 @@ def test_judge_single_segment(self):
 
     @pytest.mark.private
     def test_judge_segments(self):
+        """
+        `_judge_segments()` should return a `JudgementType`, which can be a string.
+        """
         self.assertEqual(self.detector._judge_segments(
-            "我哋去邊度？我们去哪里？Hello!"), "mixed")
+            ["我哋去邊度？", "我们去哪里？", "Hello!"]), "mixed")
 
     @pytest.mark.private
     def test_judge_matrix_quotes(self):
+        """
+        `_judge_matrix_quotes()` should return a `JudgementType`.
+        """
         self.assertEqual(self.detector._judge_matrix_quotes(
             "他說「係噉嘅」"), JudgementType.CANTONESE_QUOTES_IN_SWC)
         self.assertEqual(self.detector._judge_matrix_quotes(
             "他說「是咁的」"), JudgementType.MIXED_QUOTES_IN_SWC)
 
     def test_judge(self):
+        """
+        `judge()` should return a `JudgementType`.
+        """
         self.assertEqual(self.detector.judge(
             "我哋去邊度？"), JudgementType.CANTONESE)
         self.assertEqual(self.detector.judge("我们去哪里？"), JudgementType.SWC)