Implement PresetConversion

StarCC0 · Apr 25, 2022 · a26cb9e · a26cb9e
1 parent d2bcf29
commit a26cb9e
Show file tree

Hide file tree

Showing 5 changed files with 106 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ dist
 .mypy_cache
 
 /src/StarCC/dict
+/test/testcases
diff --git a/README.md b/README.md
@@ -4,34 +4,35 @@
 
 [![Discussion - on Telegram](https://img.shields.io/badge/Discussion-on_Telegram-2ca5e0?logo=telegram)](https://t.me/+jOyC1UnIqZE3OGQ1)
 
-## 用法 Usage
+## 安裝 Installation
 
 ```sh
 pip install starcc
 ```
 
+## 用法 Usage
+
+不轉換用詞 Without phrase conversion:
+
 ```python
-from StarCC import Conversion, Dicts
-convert = Conversion((Dicts.CN2ST, Dicts.ST2HK))  # change conversion mode here
+from StarCC import PresetConversion
+convert = PresetConversion(src='cn', dst='hk', with_phrase=False)
 print(convert('为什么你在床里面睡着？我们的硅二极管坏了，要去老挝修理。'))
 # 為什麼你在牀裏面睡着？我們的硅二極管壞了，要去老撾修理。
 ```
 
+轉換用詞 With phrase conversion:
+
+```python
+from StarCC import PresetConversion
+convert = PresetConversion(src='cn', dst='tw', with_phrase=True)
+print(convert('为什么你在床里面睡着？我们的硅二极管坏了，要去老挝修理。'))
+# 為什麼你在床裡面睡著？我們的矽二極體壞了，要去寮國修理。
+```
+
 ## 轉換模式一覽 Supported conversion modes
 
-| 源文本<br>From | 目標文本<br>To | 轉換詞彙？<br>Convert Phrases? | 配置<br>Config |
-| :-: | :-: | :-: | :-: |
-| `zh-CN` | `zh-HK` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2HK))` |
-| `zh-CN` | `zh-TW` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2TW))` |
-| `zh-CN` | `zh-JP` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2JP))` |
-| `zh-HK` | `zh-CN` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2CN))` |
-| `zh-HK` | `zh-TW` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2TW))` |
-| `zh-HK` | `zh-JP` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2JP))` |
-| `zh-TW` | `zh-CN` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2CN))` |
-| `zh-TW` | `zh-HK` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2HK))` |
-| `zh-TW` | `zh-JP` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2JP))` |
-| `zh-JP` | `zh-CN` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2CN))` |
-| `zh-JP` | `zh-HK` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2HK))` |
-| `zh-JP` | `zh-TW` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2TW))` |
-| `zh-CN` | `zh-TW` | ✅ | `Conversion((Dicts.CN2ST, Dicts.ST2TWP))` |
-| `zh-TW` | `zh-CN` | ✅ | `Conversion((Dicts.TWP2ST, Dicts.ST2CN))` |
+- `cn`: Simplified Chinese (Mainland China)
+- `hk`: Traditional Chinese (Hong Kong)
+- `tw`: Traditional Chinese (Taiwan)
+- `jp`: Japanese Shinjitai
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
 	name='starcc',
-	version='0.0.1',
+	version='0.0.2',
 	description='Python implementation of StarCC',
 	long_description=long_description,
 	long_description_content_type='text/markdown',

diff --git a/src/StarCC/__init__.py b/src/StarCC/__init__.py
@@ -71,3 +71,47 @@ def __call__(self, s: str) -> str:
         for trie in self.tries:
             s = _convert(trie, s)
         return s
+
+class PresetConversion(Conversion):
+    """Conversion built from a source and a destination variant code.
+
+    The text is converted in up to two steps: from the source variant to
+    'st', then from 'st' to the destination variant. A step is skipped when
+    the corresponding side already is 'st'. The selected dictionaries are
+    handed to Conversion in that order.
+
+    Variant codes: 'cn', 'hk', 'tw', 'jp' and 'st' ('st' is presumably the
+    intermediate standard Traditional form that the X2ST / ST2X dictionaries
+    are named after).
+    """
+
+    def __init__(self, src: str='cn', dst: str='hk', with_phrase: bool=False) -> None:
+        """Select the dictionaries for converting from src to dst.
+
+        Args:
+            src: Variant code of the input text.
+            dst: Variant code of the output text; must differ from src.
+            with_phrase: Use the phrase-converting dictionaries. Only 'cn'
+                and 'tw' (and the pass-through 'st') are available in this
+                mode.
+
+        Raises:
+            ValueError: If src or dst is not a known variant code, if both
+                are equal, or if with_phrase is requested for a variant that
+                has no phrase dictionary.
+        """
+        variants = ('st', 'cn', 'hk', 'tw', 'jp')
+        if src not in variants:
+            raise ValueError(f'Invalid src value: {src}')
+        if dst not in variants:
+            raise ValueError(f'Invalid dst value: {dst}')
+        # Raise instead of assert: asserts are stripped when Python runs with -O.
+        if src == dst:
+            raise ValueError(f'src and dst must be different, got {src} for both')
+
+        if with_phrase:
+            # CN does not need to convert phrases, so it reuses the plain dictionaries.
+            to_st = {'cn': Dicts.CN2ST, 'tw': Dicts.TWP2ST}
+            from_st = {'cn': Dicts.ST2CN, 'tw': Dicts.ST2TWP}
+        else:
+            to_st = {
+                'cn': Dicts.CN2ST,
+                'hk': Dicts.HK2ST,
+                'tw': Dicts.TW2ST,
+                'jp': Dicts.JP2ST,
+            }
+            from_st = {
+                'cn': Dicts.ST2CN,
+                'hk': Dicts.ST2HK,
+                'tw': Dicts.ST2TW,
+                'jp': Dicts.ST2JP,
+            }
+
+        dicts_list = []
+        # Each side is validated against its own table, so the destination
+        # check looks at dst (not src) and reports the variant that failed.
+        for variant, table in ((src, to_st), (dst, from_st)):
+            if variant == 'st':
+                continue  # already in the intermediate form, no step needed
+            if variant not in table:
+                # Only reachable with with_phrase: the plain tables cover
+                # every non-'st' variant code.
+                raise ValueError(f'Phrase conversion for {variant} is currently not supported')
+            dicts_list.append(table[variant])
+
+        super().__init__(dicts_list)
diff --git a/test/main.py b/test/main.py
@@ -0,0 +1,40 @@
+import os
+from os import path
+from StarCC import PresetConversion
+
+# Each entry is (testcase name, (src, dst, with_phrase)). The name selects
+# test/testcases/<name>.in and test/testcases/<name>.ans; the inner tuple is
+# unpacked into PresetConversion's positional arguments.
+tests = (
+    ('hk2s', ('hk', 'cn', False)),
+    ('hk2t', ('hk', 'st', False)),
+    ('jp2t', ('jp', 'st', False)),
+    ('s2hk', ('cn', 'hk', False)),
+    ('s2t', ('cn', 'st', False)),
+    ('s2tw', ('cn', 'tw', False)),
+    ('s2twp', ('cn', 'tw', True)),
+    ('t2hk', ('st', 'hk', False)),
+    ('t2jp', ('st', 'jp', False)),
+    ('t2s', ('st', 'cn', False)),
+    ('tw2s', ('tw', 'cn', False)),
+    ('tw2sp', ('tw', 'cn', True)),
+    ('tw2t', ('tw', 'st', False)),
+)
+
+# The test cases live in a separate repository checked out at test/testcases:
+# clone it on first use, otherwise update the existing checkout. The pull has
+# to run inside test/testcases; running it in test/ would pull the enclosing
+# repository instead and leave the test cases stale.
+if not path.exists('test/testcases'):
+    os.system('git -C test clone https://github.com/StarCC0/testcases.git')
+else:
+    os.system('git -C test/testcases pull')
+
+def run_test(name, config):
+    """Run one test case and print a report when the output is wrong.
+
+    Reads the input from test/testcases/<name>.in and the expected output
+    from test/testcases/<name>.ans, converts the input with a
+    PresetConversion built from the positional arguments in config, and
+    compares the two texts. Nothing is printed when they match.
+    """
+    with open(f'test/testcases/{name}.in', encoding='utf-8') as in_file:
+        source_text = in_file.read()
+    with open(f'test/testcases/{name}.ans', encoding='utf-8') as ans_file:
+        expected = ans_file.read()
+
+    actual = PresetConversion(*config)(source_text)
+
+    if expected == actual:
+        return
+
+    print(f'Error found in {name}\n'
+          f'Expected: {expected}\n'
+          f'Got: {actual}\n\n')
+
+# Run every configured test case in the order listed in `tests`.
+for case_name, case_config in tests:
+    run_test(case_name, case_config)