Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

emoji 태그 추가 #167

Merged
merged 7 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ModelGenerator/sj.knlm
Git LFS file not shown
4 changes: 2 additions & 2 deletions ModelGenerator/sj.morph
Git LFS file not shown
4 changes: 2 additions & 2 deletions ModelGenerator/skipbigram.mdl
Git LFS file not shown
16 changes: 9 additions & 7 deletions bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public static class Match {
hashtag = 1 << 2,
mention = 1 << 3,
serial = 1 << 4,
emoji = 1 << 5,
normalizeCoda = 1 << 16,
joinNounPrefix = 1 << 17,
joinNounSuffix = 1 << 18,
Expand Down Expand Up @@ -48,13 +49,13 @@ public static class POSTag {
vcp = 19, vcn = 20,
sf = 21, sp = 22, ss = 23, sso = 24, ssc = 25, se = 26, so = 27, sw = 28, sb = 29,
sl = 30, sh = 31, sn = 32,
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37,
jks = 38, jkc = 39, jkg = 40, jko = 41, jkb = 42, jkv = 43, jkq = 44, jx = 45, jc = 46,
ep = 47, ef = 48, ec = 49, etn = 50, etm = 51,
z_coda = 52,
user0 = 53, user1 = 54, user2 = 55, user3 = 56, user4 = 57,
p = 58,
max = 59,
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, w_emoji = 38,
jks = 39, jkc = 40, jkg = 41, jko = 42, jkb = 43, jkv = 44, jkq = 45, jx = 46, jc = 47,
ep = 48, ef = 49, ec = 50, etn = 51, etm = 52,
z_coda = 53,
user0 = 54, user1 = 55, user2 = 56, user3 = 57, user4 = 58,
p = 59,
max = 60,
pv = p,
pa = (byte)(p + 1),
irregular = - 128,
Expand Down Expand Up @@ -106,6 +107,7 @@ static String toString(byte tag) {
case w_mention: return "W_MENTION";
case w_hashtag: return "W_HASHTAG";
case w_serial: return "W_SERIAL";
case w_emoji: return "W_EMOJI";
case jks: return "JKS";
case jkc: return "JKC";
case jkg: return "JKG";
Expand Down
3 changes: 2 additions & 1 deletion bindings/java/kr/pe/bab2min/KiwiBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ public static class BuildOption {
integrateAllomorph = 1 << 0,
loadDefaultDict = 1 << 1,
loadTypoDict = 1 << 2,
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict;
loadMultiDict = 1 << 3,
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict;
}

public static class AnalyzedMorph {
Expand Down
3 changes: 2 additions & 1 deletion include/kiwi/PatternMatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ namespace kiwi
hashtag = 1 << 2, /**< 해시태그 형태의 텍스트(#해시)를 w_hashtag 태그에 매칭한다 */
mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */
serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */
emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */
normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */
joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */
joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */
Expand All @@ -25,7 +26,7 @@ namespace kiwi
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | zCoda,
all = url | email | hashtag | mention | serial | emoji | zCoda,
allWithNormalizing = all | normalizeCoda,
};

Expand Down
7 changes: 6 additions & 1 deletion include/kiwi/ScriptType.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,10 @@ namespace kiwi

const char* getScriptName(ScriptType type);

bool isEmoji(char32_t c0, char32_t c1 = 0);
/**
* @brief Check if the character is an emoji
*
* @return 0 if the character is not an emoji, 1 if c0 is an emoji, 2 if c0 and c1 are combined to form an emoji.
*/
int isEmoji(char32_t c0, char32_t c1 = 0);
}
2 changes: 1 addition & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ namespace kiwi
vcp, vcn,
sf, sp, ss, sso, ssc, se, so, sw, sb,
sl, sh, sn,
w_url, w_email, w_mention, w_hashtag, w_serial,
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
ep, ef, ec, etn, etm,
z_coda,
Expand Down
4 changes: 2 additions & 2 deletions include/kiwi/Utils.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#pragma once
#pragma once
#include <iostream>
#include <string>
#include <memory>
Expand Down Expand Up @@ -30,7 +30,7 @@ namespace kiwi

/**
 * @brief Tell whether a tag is one of the pattern-matched `w_*` tags.
 *
 * The check is a range test over the POSTag enumerators from `w_url`
 * through `w_emoji`, so it relies on those enumerators being declared
 * contiguously and in that order (w_url, w_email, w_mention, w_hashtag,
 * w_serial, w_emoji).
 *
 * @param t tag to classify
 * @return true if `t` lies in [w_url, w_emoji], false otherwise
 */
inline bool isWebTag(POSTag t)
{
	// Upper bound is the last w_* enumerator; ending at w_hashtag would
	// leave w_serial and w_emoji outside the range.
	return POSTag::w_url <= t && t <= POSTag::w_emoji;
}

POSTag toPOSTag(const std::u16string& tagStr);
Expand Down
2 changes: 1 addition & 1 deletion src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ namespace kiwi

inline void updateTokenInfoScript(TokenInfo& info)
{
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return;
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw || info.tag == POSTag::w_emoji)) return;
if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return;
if (info.str.empty()) return;
char32_t c = info.str[0];
Expand Down
75 changes: 75 additions & 0 deletions src/PatternMatcher.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <kiwi/PatternMatcher.h>
#include <kiwi/Utils.h>
#include <kiwi/ScriptType.h>
#include "pattern.hpp"
#include "StrUtils.h"

using namespace std;
using namespace kiwi;
Expand All @@ -26,6 +28,7 @@ namespace kiwi
size_t testNumeric(const char16_t left, const char16_t* first, const char16_t* last) const;
size_t testSerial(const char16_t* first, const char16_t* last) const;
size_t testAbbr(const char16_t* first, const char16_t* last) const;
size_t testEmoji(const char16_t* first, const char16_t* last) const;

public:
std::pair<size_t, POSTag> match(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions) const;
Expand Down Expand Up @@ -290,6 +293,77 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last)
return b - first;
}

/**
 * @brief Measure the emoji sequence that starts at `first`.
 *
 * Repeatedly decodes up to two code points from the UTF-16 input and asks
 * isEmoji() about them (0: not an emoji, 1: the first code point is an emoji,
 * 2: both code points together form an emoji). After each emoji, one optional
 * variation selector (U+FE00..U+FE0F) or one optional skin color modifier
 * (U+1F3FB..U+1F3FF) is absorbed. A zero width joiner (U+200D) chains the
 * sequence on to the next emoji.
 *
 * @param first start of the UTF-16 range to examine
 * @param last  one past the end of the UTF-16 range
 * @return number of UTF-16 code units matched, or 0 if no emoji starts at `first`
 */
size_t PatternMatcherImpl::testEmoji(const char16_t* first, const char16_t* last) const
{
	const char16_t* b = first;
	// End of the last complete emoji. A joiner is only counted once another
	// emoji has been matched after it, so a dangling U+200D is never included.
	const char16_t* matched = first;

	// `b < last` (rather than `b + 1 < last`) so that an emoji occupying the
	// final single code unit of the input is still examined.
	while (b < last)
	{
		char32_t c0 = 0, c1 = 0;

		// Decode the first code point; only merge a surrogate pair when the
		// second unit is actually inside the range.
		const char16_t* b1 = b;
		if (isHighSurrogate(*b1) && b1 + 1 < last)
		{
			c0 = mergeSurrogate(b1[0], b1[1]);
			b1 += 2;
		}
		else
		{
			c0 = *b1++;
		}

		// Decode the following code point, if any, for two-code-point emoji.
		const char16_t* b2 = b1;
		if (b2 < last)
		{
			if (isHighSurrogate(*b2) && b2 + 1 < last)
			{
				c1 = mergeSurrogate(b2[0], b2[1]);
				b2 += 2;
			}
			else
			{
				c1 = *b2++;
			}
		}

		const auto r = isEmoji(c0, c1);
		if (r == 1)
		{
			b = b1;
		}
		else if (r == 2)
		{
			b = b2;
		}
		else
		{
			break;
		}

		if (b < last)
		{
			if (0xfe00 <= *b && *b <= 0xfe0f) // variation selectors
			{
				++b;
			}
			else if (b + 1 < last && isHighSurrogate(b[0]))
			{
				const char32_t modifier = mergeSurrogate(b[0], b[1]);
				if (0x1f3fb <= modifier && modifier <= 0x1f3ff) // skin color modifier
				{
					b += 2;
				}
			}
		}
		matched = b;

		if (b == last || *b != 0x200d) break;
		++b; // zero width joiner: tentatively step over it and look for the next emoji
	}
	return static_cast<size_t>(matched - first);
}

pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * first, const char16_t * last, Match matchOptions) const
{
size_t size;
Expand All @@ -299,6 +373,7 @@ pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * f
if (!!(matchOptions & Match::email) && (size = testEmail(first, last))) return make_pair(size, POSTag::w_email);
if (!!(matchOptions & Match::mention) && (size = testMention(first, last))) return make_pair(size, POSTag::w_mention);
if (!!(matchOptions & Match::url) && (size = testUrl(first, last))) return make_pair(size, POSTag::w_url);
if (!!(matchOptions & Match::emoji) && (size = testEmoji(first, last))) return make_pair(size, POSTag::w_emoji);
if ((size = testAbbr(first, last))) return make_pair(size, POSTag::sl);
return make_pair(0, POSTag::unknown);
}
Expand Down
Loading
Loading