Skip to content

Commit

Permalink
增加新判別詞彙
Browse files Browse the repository at this point in the history
  • Loading branch information
laubonghaudoi committed Apr 8, 2024
1 parent 91f221e commit a372770
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
33 changes: 22 additions & 11 deletions cantofilter/judge.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,38 @@
"""
Core logic:
1. Extract the Cantonese unique words, Mandarin unique words, Mandarin feature words
and Mandarin loan words in the input.
2. Check whether every Mandarin feature word in the input is a Mandarin loan word;
the result is `is_all_loan`.
3. Output the classification result based on which Cantonese/Mandarin
unique/feature words the input contains.
"""
import re
from typing import List, Tuple

canto_unique = re.compile(
CANTO_UNIQUE = re.compile(
r'[嘅嗰啲咗佢喺咁噉冇啩哋畀嚟諗惗乜嘢閪撚𨳍𨳊瞓睇㗎餸𨋢摷喎嚿噃嚡嘥嗮啱揾搵喐逳噏𢳂岋糴揈捹撳㩒𥄫攰癐冚孻冧𡃁嚫跣𨃩瀡氹嬲掟孭黐唞㪗埞忟𢛴]|' +
r'唔[係得會好識使洗駛通知到去走掂該錯差]|點[樣會做得解]|[琴尋噚聽第]日|[而依]家|家[下陣]|[真就實梗又話都]係|邊[度個位科]|' +
r'[嚇凍攝整揩逢淥浸激][親嚫]|[橫搞傾諗得唔]掂|仲[有係話要得好衰唔]|返[學工去歸]|執[好生實返輸]|' +
r'屋企|收皮|慳錢|傾[偈計]|幫襯|求其|是[但旦]|[濕溼]碎|零舍|肉[赤緊酸]|核突|同埋|勁[秋抽]')
mando_unique = re.compile(r'[這哪您們唄咱啥甭她]|還[是好有]')
MANDO_UNIQUE = re.compile(r'[這哪您們唄咱啥甭她]|還[是好有]')
# “在不” 因為太多融入粵語所以唔喺判別標準內
mando_feature = re.compile(r'[那是的他它吧沒麼么些了卻説說吃弄]|而已')
mando_loan = re.compile(r'亞利桑那|剎那|巴塞羅那|薩那|沙那|哈瓦那|印第安那|那不勒斯|支那|' +
MANDO_FEATURE = re.compile(r'[那是的他它看吧沒麼么些了卻説說吃弄也]|而已')
MANDO_LOAN = re.compile(r'亞利桑那|剎那|巴塞羅那|薩那|沙那|哈瓦那|印第安那|那不勒斯|支那|' +
r'是[否日次非但旦]|[利於]是|唯命是從|頭頭是道|似是而非|自以為是|俯拾皆是|撩是鬥非|莫衷一是|唯才是用|' +
r'[目綠藍紅中]的|的[士確式]|波羅的海|眾矢之的|的而且確|大眼的度|' +
r'[目綠藍紅中飛]的|的[士確式色]|波羅的海|眾矢之的|的而且確|大眼的度|' +
r'些[微少許小]|' +
r'[淹沉浸覆湮埋沒出]沒|沒[落頂收]|神出鬼沒|' +
r'了[結無斷當然哥結得解事之]|[未明]了|不得了|大不了|' +
r'他[信人國日殺鄉]|[其利無排維結]他|馬耳他|他加祿|他山之石|' +
r'其[它]|' +
r'[收查窺觀]看|看[守住好護]|刮目相看|' +

This comment has been minimized.

Copy link
@graphemecluster

graphemecluster Apr 8, 2024

Member

From TypeDuck Lexicon:

                        r'[收查窺觀細且察相回參俯]看|看[守住好護待更台管門望顧牛羊板]|乍看之下|霧[裏裡]看花|走馬看花|看破紅塵|看風使舵|' +
r'[酒網水貼]吧|吧[台臺枱檯]|' +
r'[退忘阻]卻|卻步|' +
r'[遊游小傳解學假淺眾衆訴論][説說]|[說説][話服明]|自圓其[説說]|長話短[說説]|不由分[說説]|' +
r'吃[虧苦力]|' +
r'弄[堂]')
r'弄[堂]|[賣擺嘲]弄|' +
r'可怒也|可惱也|可惱也|如也|也門|之乎者也|天助我也')


def is_within_loan_span(feature_span: Tuple[int, int], loan_spans: List[Tuple[int, int]]) -> bool:
Expand All @@ -47,8 +58,8 @@ def is_all_loan(s: str) -> bool:
判斷一句話入面所有官話特徵係唔係都係借詞
Judge whether all Mandarin features in a sentence are loan words.
'''
mando_features = mando_feature.finditer(s)
mando_loans = mando_loan.finditer(s)
mando_features = MANDO_FEATURE.finditer(s)
mando_loans = MANDO_LOAN.finditer(s)
feature_spans = [m.span() for m in mando_features]
loan_spans = [m.span() for m in mando_loans]

Expand All @@ -70,9 +81,9 @@ def judge(s: str) -> str:
Returns:
str: 粵語、官話、官話溝粵語定係中性 `cantonese`, `mandarin`, `mixed`, or `neutral`.
'''
has_canto_unique = bool(re.search(canto_unique, s))
has_mando_unique = bool(re.search(mando_unique, s))
has_mando_feature = bool(re.search(mando_feature, s))
has_canto_unique = bool(re.search(CANTO_UNIQUE, s))
has_mando_unique = bool(re.search(MANDO_UNIQUE, s))
has_mando_feature = bool(re.search(MANDO_FEATURE, s))

if has_canto_unique:
# 含有粵語成分
Expand Down
9 changes: 5 additions & 4 deletions tests/test_judge.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import unittest
from cantofilter.judge import judge

cantonese = ["你喺邊度","乜你今日唔使返學咩","今日好可能會嚟唔到", "我哋影張相留念"]
mandarin = ["你在哪裏","你想插班的話"]
mixed = ["是咁的","屋企停電的話"]
neutral = ["去學校讀書","做人最重要開心"]
cantonese = ["你喺邊度", "乜你今日唔使返學咩", "今日好可能會嚟唔到", "我哋影張相留念"]
mandarin = ["你在哪裏", "你想插班的話", "家長也應做好家居防蚊措施", "教育不只是為了傳授知識"]
mixed = ["是咁的", "屋企停電的話", "但長遠來講,都係申請息口較低的貸款比較划算"]
neutral = ["去學校讀書", "做人最重要開心",
"外交部駐香港特別行政區特派員公署副特派員", "全日制或大學生於晚市星期一至星期四一天前訂座"]


class TestJudgeFunction(unittest.TestCase):
Expand Down

1 comment on commit a372770

@graphemecluster
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

下次最好去是但一個 lexicon grep 一次先,例如 https://github.com/TypeDuck-HK/schema/blob/master/jyut6ping3.dict.yaml 而家有大概 100 行有「看」字

Please sign in to comment.