Skip to content

Commit

Permalink
Implement PresetConversion
Browse files Browse the repository at this point in the history
  • Loading branch information
ayaka14732 committed Apr 25, 2022
1 parent d2bcf29 commit a26cb9e
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ dist
.mypy_cache

/src/StarCC/dict
/test/testcases
39 changes: 20 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,35 @@

[![Discussion - on Telegram](https://img.shields.io/badge/Discussion-on_Telegram-2ca5e0?logo=telegram)](https://t.me/+jOyC1UnIqZE3OGQ1)

## 用法 Usage
## 安裝 Installation

```sh
pip install starcc
```

## 用法 Usage

不轉換用詞 Without phrase conversion:

```python
from StarCC import Conversion, Dicts
convert = Conversion((Dicts.CN2ST, Dicts.ST2HK)) # change conversion mode here
from StarCC import PresetConversion
convert = PresetConversion(src='cn', dst='hk', with_phrase=False)
print(convert('为什么你在床里面睡着?我们的硅二极管坏了,要去老挝修理。'))
# 為什麼你在牀裏面睡着?我們的硅二極管壞了,要去老撾修理。
```

轉換用詞 With phrase conversion:

```python
from StarCC import PresetConversion
convert = PresetConversion(src='cn', dst='tw', with_phrase=True)
print(convert('为什么你在床里面睡着?我们的硅二极管坏了,要去老挝修理。'))
# 為什麼你在床裡面睡著?我們的矽二極體壞了,要去寮國修理。
```

## 轉換模式一覽 Supported conversion modes

| 源文本<br>From | 目標文本<br>To | 轉換詞彙?<br>Convert Phrases? | 配置<br>Config |
| :-: | :-: | :-: | :-: |
| `zh-CN` | `zh-HK` || `Conversion((Dicts.CN2ST, Dicts.ST2HK))` |
| `zh-CN` | `zh-TW` || `Conversion((Dicts.CN2ST, Dicts.ST2TW))` |
| `zh-CN` | `zh-JP` || `Conversion((Dicts.CN2ST, Dicts.ST2JP))` |
| `zh-HK` | `zh-CN` || `Conversion((Dicts.HK2ST, Dicts.ST2CN))` |
| `zh-HK` | `zh-TW` || `Conversion((Dicts.HK2ST, Dicts.ST2TW))` |
| `zh-HK` | `zh-JP` || `Conversion((Dicts.HK2ST, Dicts.ST2JP))` |
| `zh-TW` | `zh-CN` || `Conversion((Dicts.TW2ST, Dicts.ST2CN))` |
| `zh-TW` | `zh-HK` || `Conversion((Dicts.TW2ST, Dicts.ST2HK))` |
| `zh-TW` | `zh-JP` || `Conversion((Dicts.TW2ST, Dicts.ST2JP))` |
| `zh-JP` | `zh-CN` || `Conversion((Dicts.JP2ST, Dicts.ST2CN))` |
| `zh-JP` | `zh-HK` || `Conversion((Dicts.JP2ST, Dicts.ST2HK))` |
| `zh-JP` | `zh-TW` || `Conversion((Dicts.JP2ST, Dicts.ST2TW))` |
| `zh-CN` | `zh-TW` || `Conversion((Dicts.CN2ST, Dicts.ST2TWP))` |
| `zh-TW` | `zh-CN` || `Conversion((Dicts.TWP2ST, Dicts.ST2CN))` |
- `cn`: Simplified Chinese (Mainland China)
- `hk`: Traditional Chinese (Hong Kong)
- `tw`: Traditional Chinese (Taiwan)
- `jp`: Japanese Shinjitai
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name='starcc',
version='0.0.1',
version='0.0.2',
description='Python implementation of StarCC',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down
44 changes: 44 additions & 0 deletions src/StarCC/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,47 @@ def __call__(self, s: str) -> str:
for trie in self.tries:
s = _convert(trie, s)
return s

class PresetConversion(Conversion):
    """Conversion configured from a source variant and a destination variant.

    The dictionaries are chosen so that text is first converted from ``src``
    to the pivot form ``'st'`` and then from ``'st'`` to ``dst``. When either
    side already is ``'st'``, the corresponding step is left out.
    """

    def __init__(self, src: str = 'cn', dst: str = 'hk', with_phrase: bool = False) -> None:
        """Select the dictionaries for converting ``src`` text into ``dst`` text.

        Args:
            src: Variant of the input text; one of ``'st'``, ``'cn'``,
                ``'hk'``, ``'tw'`` or ``'jp'``.
            dst: Variant of the output text; same choices as ``src`` and
                must differ from it.
            with_phrase: Use the phrase-converting dictionaries. Only the
                ``'cn'`` and ``'tw'`` variants (and the ``'st'`` pivot, which
                needs no dictionary) are accepted in this mode.

        Raises:
            ValueError: If ``src`` or ``dst`` is not a known variant, if they
                are equal, or if phrase conversion is requested for a variant
                that does not support it.
        """
        variants = ('st', 'cn', 'hk', 'tw', 'jp')
        if src not in variants:
            raise ValueError(f'Invalid src value: {src}')
        if dst not in variants:
            raise ValueError(f'Invalid dst value: {dst}')
        # Raise instead of assert: assertions are stripped under `python -O`.
        if src == dst:
            raise ValueError(f'src and dst must be different, got: {src}')

        if with_phrase:
            to_st = {
                'cn': Dicts.CN2ST,  # CN does not need to convert phrases
                'tw': Dicts.TWP2ST,
            }
            from_st = {
                'cn': Dicts.ST2CN,  # CN does not need to convert phrases
                'tw': Dicts.ST2TWP,
            }
        else:
            to_st = {
                'cn': Dicts.CN2ST,
                'hk': Dicts.HK2ST,
                'tw': Dicts.TW2ST,
                'jp': Dicts.JP2ST,
            }
            from_st = {
                'cn': Dicts.ST2CN,
                'hk': Dicts.ST2HK,
                'tw': Dicts.ST2TW,
                'jp': Dicts.ST2JP,
            }

        dicts_list = []
        # Each side is checked against its own table, so an unsupported
        # destination is reported as such rather than surfacing as a KeyError.
        for variant, table in ((src, to_st), (dst, from_st)):
            if variant == 'st':
                continue  # already the pivot form; no dictionary needed
            if variant not in table:
                # Only reachable with with_phrase=True: the plain tables
                # cover every validated non-'st' variant.
                raise ValueError(f'Phrase conversion for {variant} is currently not supported')
            dicts_list.append(table[variant])

        super().__init__(dicts_list)
40 changes: 40 additions & 0 deletions test/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from os import path
from StarCC import PresetConversion

# Test matrix. Each entry is (name, config):
#   name   - base name of the files test/testcases/<name>.in (input) and
#            test/testcases/<name>.ans (expected output) read by run_test.
#   config - positional arguments for PresetConversion, i.e.
#            (src, dst, with_phrase).
tests = (
('hk2s', ('hk', 'cn', False)),
('hk2t', ('hk', 'st', False)),
('jp2t', ('jp', 'st', False)),
('s2hk', ('cn', 'hk', False)),
('s2t', ('cn', 'st', False)),
('s2tw', ('cn', 'tw', False)),
('s2twp', ('cn', 'tw', True)),
('t2hk', ('st', 'hk', False)),
('t2jp', ('st', 'jp', False)),
('t2s', ('st', 'cn', False)),
('tw2s', ('tw', 'cn', False)),
('tw2sp', ('tw', 'cn', True)),
('tw2t', ('tw', 'st', False)),
)

# Fetch the shared test cases on the first run; on later runs update the
# existing checkout. The pull has to run inside test/testcases: with
# `git -C test pull` git would operate on the repository that contains the
# test/ directory (the project itself), not on the test-case checkout.
if not path.exists('test/testcases'):
    os.system('git -C test clone https://github.com/StarCC0/testcases.git')
else:
    os.system('git -C test/testcases pull')

def run_test(name, config):
    """Run one test case and print a report if the output differs.

    Reads the input text from ``test/testcases/<name>.in`` and the expected
    text from ``test/testcases/<name>.ans``, converts the input with
    ``PresetConversion(*config)`` and compares the result with the expected
    text. Nothing is printed when they match.
    """
    with open(f'test/testcases/{name}.in', encoding='utf-8') as input_file:
        source_text = input_file.read()
    with open(f'test/testcases/{name}.ans', encoding='utf-8') as answer_file:
        expected = answer_file.read()

    converter = PresetConversion(*config)
    actual = converter(source_text)

    if expected == actual:
        return

    print(f'Error found in {name}\n'
          f'Expected: {expected}\n'
          f'Got: {actual}\n\n')

# Run every configured case in order; run_test reports mismatches itself.
for case_name, case_config in tests:
    run_test(case_name, case_config)

0 comments on commit a26cb9e

Please sign in to comment.