module Unicode

Constants

VERSION

Public Class Methods

abbr_categories(p1) click to toggle source
/*
 * Unicode.abbr_categories(str)
 *
 * Builds a WString from the bytes of +str+ and hands it, together with
 * catname_abbr, to get_categories_internal; the value produced by that
 * helper is returned unchanged.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
VALUE
unicode_get_abbr_categories(VALUE obj, VALUE str)
{
  WString codepoints;
  /* Filled in before CONVERT_TO_UTF8 runs, so args carries str exactly as
   * the caller passed it.  NOTE(review): if that macro rebinds str, the
   * helper still sees the original object -- confirm this is intended. */
  get_categories_param args = { &codepoints, str, catname_abbr };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&codepoints, RSTRING_PTR(str), RSTRING_LEN(str));

  /* rb_ensure calls get_categories_ensure(&codepoints) whether or not the
   * body raises; releasing codepoints is left to that callback. */
  return rb_ensure(get_categories_internal, (VALUE)&args,
                   get_categories_ensure, (VALUE)&codepoints);
}
capitalize(p1) click to toggle source
/*
 * Unicode.capitalize(str)
 *
 * Runs capitalize_internal over the WString form of +str+ and returns the
 * outcome as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_capitalize(VALUE obj, VALUE str)
{
  WString input;
  WString mapped;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Ruby String -> WString -> case mapping */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  capitalize_internal(&input, &mapped);
  WStr_free(&input);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&mapped, &bytes);
  WStr_free(&mapped);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
categories(p1) click to toggle source
/*
 * Unicode.categories(str)
 *
 * Builds a WString from the bytes of +str+ and hands it, together with
 * catname_long, to get_categories_internal; the value produced by that
 * helper is returned unchanged.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
VALUE
unicode_get_categories(VALUE obj, VALUE str)
{
  WString codepoints;
  /* Filled in before CONVERT_TO_UTF8 runs, so args carries str exactly as
   * the caller passed it.  NOTE(review): if that macro rebinds str, the
   * helper still sees the original object -- confirm this is intended. */
  get_categories_param args = { &codepoints, str, catname_long };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&codepoints, RSTRING_PTR(str), RSTRING_LEN(str));

  /* rb_ensure calls get_categories_ensure(&codepoints) whether or not the
   * body raises; releasing codepoints is left to that callback. */
  return rb_ensure(get_categories_internal, (VALUE)&args,
                   get_categories_ensure, (VALUE)&codepoints);
}
compose(p1) click to toggle source
/*
 * Unicode.compose(str)
 *
 * Applies sort_canonical to the WString form of +str+, feeds it through
 * compose_internal, and returns the outcome as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_compose(VALUE obj, VALUE str)
{
  WString input;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* The input is reordered in place before composition. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  sort_canonical(&input);
  WStr_alloc(&composed);
  compose_internal(&input, &composed);
  WStr_free(&input);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
decompose(p1) click to toggle source
/*
 * Unicode.decompose(str)
 *
 * Feeds the WString form of +str+ through decompose_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
decompose_compat(p1) click to toggle source
/*
 * Unicode.decompose_compat(str)
 *
 * Feeds the WString form of +str+ through decompose_compat_internal,
 * applies sort_canonical to the output, and returns it as a newly created
 * Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
decompose_safe(p1) click to toggle source
/*
 * Unicode.decompose_safe(str)
 *
 * Feeds the WString form of +str+ through decompose_safe_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
downcase(p1) click to toggle source
/*
 * Unicode.downcase(str)
 *
 * Runs downcase_internal over the WString form of +str+ and returns the
 * outcome as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_downcase(VALUE obj, VALUE str)
{
  WString input;
  WString mapped;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Ruby String -> WString -> case mapping */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  downcase_internal(&input, &mapped);
  WStr_free(&input);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&mapped, &bytes);
  WStr_free(&mapped);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfc(p1) click to toggle source
/*
 * Unicode.nfc(str)
 *
 * Three stages over the WString form of +str+: decompose_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_C(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfc_safe(p1) click to toggle source
/*
 * Unicode.nfc_safe(str)
 *
 * Three stages over the WString form of +str+: decompose_safe_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_safe(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfd(p1) click to toggle source
/*
 * Unicode.nfd(str)
 *
 * Feeds the WString form of +str+ through decompose_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfd_safe(p1) click to toggle source
/*
 * Unicode.nfd_safe(str)
 *
 * Feeds the WString form of +str+ through decompose_safe_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfkc(p1) click to toggle source
/*
 * Unicode.nfkc(str)
 *
 * Three stages over the WString form of +str+: decompose_compat_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_KC(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
nfkd(p1) click to toggle source
/*
 * Unicode.nfkd(str)
 *
 * Feeds the WString form of +str+ through decompose_compat_internal,
 * applies sort_canonical to the output, and returns it as a newly created
 * Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_C(p1) click to toggle source
/*
 * Unicode.normalize_C(str)
 *
 * Three stages over the WString form of +str+: decompose_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_C(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_C_safe(p1) click to toggle source
/*
 * Unicode.normalize_C_safe(str)
 *
 * Three stages over the WString form of +str+: decompose_safe_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_safe(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_D(p1) click to toggle source
/*
 * Unicode.normalize_D(str)
 *
 * Feeds the WString form of +str+ through decompose_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_D_safe(p1) click to toggle source
/*
 * Unicode.normalize_D_safe(str)
 *
 * Feeds the WString form of +str+ through decompose_safe_internal, applies
 * sort_canonical to the output, and returns it as a newly created Ruby
 * String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_KC(p1) click to toggle source
/*
 * Unicode.normalize_KC(str)
 *
 * Three stages over the WString form of +str+: decompose_compat_internal,
 * sort_canonical on the decomposed sequence, then compose_internal.  The
 * composed sequence is returned as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_normalize_KC(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  WString composed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* Stage 2: compose the reordered sequence. */
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: WString -> UString -> Ruby String. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&composed, &bytes);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
normalize_KD(p1) click to toggle source
/*
 * Unicode.normalize_KD(str)
 *
 * Feeds the WString form of +str+ through decompose_compat_internal,
 * applies sort_canonical to the output, and returns it as a newly created
 * Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString input;
  WString decomposed;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Decompose first, then reorder the decomposed sequence in place. */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&input, &decomposed);
  WStr_free(&input);
  sort_canonical(&decomposed);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&decomposed, &bytes);
  WStr_free(&decomposed);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
strcmp(p1, p2) click to toggle source
/*
 * Unicode.strcmp(str1, str2)
 *
 * Compares two strings after passing each through decompose_internal and
 * sort_canonical.  The decomposed sequences are converted to UString byte
 * buffers and compared bytewise.
 *
 * Both arguments must be Strings (Check_Type raises TypeError otherwise).
 *
 * Returns a Fixnum that is negative, zero or positive when str1 sorts
 * before, equal to, or after str2.  Only the sign is meaningful.
 *
 * The comparison uses the buffers' explicit lengths rather than strcmp(),
 * because Ruby strings may contain NUL bytes: strcmp() would stop at the
 * first NUL and report e.g. "a\0b" and "a\0c" as equal.
 */
static VALUE
unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
{
  WString wstr1;
  WString wstr2;
  WString result1;
  WString result2;
  UString ustr1;
  UString ustr2;
  size_t len1;
  size_t len2;
  size_t common;
  int ret;

  Check_Type(str1, T_STRING);
  Check_Type(str2, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str1);
  CONVERT_TO_UTF8(str2);
#endif
  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
  WStr_alloc(&result1);
  WStr_alloc(&result2);
  decompose_internal(&wstr1, &result1);
  decompose_internal(&wstr2, &result2);
  WStr_free(&wstr1);
  WStr_free(&wstr2);
  sort_canonical(&result1);
  sort_canonical(&result2);
  UniStr_alloc(&ustr1);
  UniStr_alloc(&ustr2);
  WStr_convertIntoUString(&result1, &ustr1);
  WStr_convertIntoUString(&result2, &ustr2);
  WStr_free(&result1);
  WStr_free(&result2);

  /* Bytewise comparison over the shared prefix; on a tie the shorter
   * buffer sorts first.  This matches strcmp()'s ordering for NUL-free
   * data while also looking past embedded NUL bytes. */
  len1 = (size_t)ustr1.len;
  len2 = (size_t)ustr2.len;
  common = len1 < len2 ? len1 : len2;
  ret = common > 0 ? memcmp(ustr1.str, ustr2.str, common) : 0;
  if (ret == 0 && len1 != len2)
    ret = len1 < len2 ? -1 : 1;

  UniStr_free(&ustr1);
  UniStr_free(&ustr2);

  return INT2FIX(ret);
}
strcmp_compat(p1, p2) click to toggle source
/*
 * Unicode.strcmp_compat(str1, str2)
 *
 * Compares two strings after passing each through
 * decompose_compat_internal and sort_canonical.  The decomposed sequences
 * are converted to UString byte buffers and compared bytewise.
 *
 * Both arguments must be Strings (Check_Type raises TypeError otherwise).
 *
 * Returns a Fixnum that is negative, zero or positive when str1 sorts
 * before, equal to, or after str2.  Only the sign is meaningful.
 *
 * The comparison uses the buffers' explicit lengths rather than strcmp(),
 * because Ruby strings may contain NUL bytes: strcmp() would stop at the
 * first NUL and report e.g. "a\0b" and "a\0c" as equal.
 */
static VALUE
unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
{
  WString wstr1;
  WString wstr2;
  WString result1;
  WString result2;
  UString ustr1;
  UString ustr2;
  size_t len1;
  size_t len2;
  size_t common;
  int ret;

  Check_Type(str1, T_STRING);
  Check_Type(str2, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str1);
  CONVERT_TO_UTF8(str2);
#endif
  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
  WStr_alloc(&result1);
  WStr_alloc(&result2);
  decompose_compat_internal(&wstr1, &result1);
  decompose_compat_internal(&wstr2, &result2);
  WStr_free(&wstr1);
  WStr_free(&wstr2);
  sort_canonical(&result1);
  sort_canonical(&result2);
  UniStr_alloc(&ustr1);
  UniStr_alloc(&ustr2);
  WStr_convertIntoUString(&result1, &ustr1);
  WStr_convertIntoUString(&result2, &ustr2);
  WStr_free(&result1);
  WStr_free(&result2);

  /* Bytewise comparison over the shared prefix; on a tie the shorter
   * buffer sorts first.  This matches strcmp()'s ordering for NUL-free
   * data while also looking past embedded NUL bytes. */
  len1 = (size_t)ustr1.len;
  len2 = (size_t)ustr2.len;
  common = len1 < len2 ? len1 : len2;
  ret = common > 0 ? memcmp(ustr1.str, ustr2.str, common) : 0;
  if (ret == 0 && len1 != len2)
    ret = len1 < len2 ? -1 : 1;

  UniStr_free(&ustr1);
  UniStr_free(&ustr2);

  return INT2FIX(ret);
}
text_elements(p1) click to toggle source
/*
 * Unicode.text_elements(str)
 *
 * Builds a WString from the bytes of +str+ and hands it to
 * get_text_elements_internal; the value produced by that helper is
 * returned unchanged.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
VALUE
unicode_get_text_elements(VALUE obj, VALUE str)
{
  WString codepoints;
  /* Filled in before CONVERT_TO_UTF8 runs, so args carries str exactly as
   * the caller passed it.  NOTE(review): if that macro rebinds str, the
   * helper still sees the original object -- confirm this is intended. */
  get_text_elements_param args = { &codepoints, str };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&codepoints, RSTRING_PTR(str), RSTRING_LEN(str));

  /* rb_ensure calls get_text_elements_ensure(&codepoints) whether or not
   * the body raises; releasing codepoints is left to that callback. */
  return rb_ensure(get_text_elements_internal, (VALUE)&args,
                   get_text_elements_ensure, (VALUE)&codepoints);
}
upcase(p1) click to toggle source
/*
 * Unicode.upcase(str)
 *
 * Runs upcase_internal over the WString form of +str+ and returns the
 * outcome as a newly created Ruby String.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 */
static VALUE
unicode_upcase(VALUE obj, VALUE str)
{
  WString input;
  WString mapped;
  UString bytes;
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Ruby String -> WString -> case mapping */
  WStr_allocWithUTF8L(&input, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  upcase_internal(&input, &mapped);
  WStr_free(&input);

  /* WString -> UString -> Ruby String; each buffer is released as soon as
   * the next stage has consumed it. */
  UniStr_alloc(&bytes);
  WStr_convertIntoUString(&mapped, &bytes);
  WStr_free(&mapped);
  out = TO_(str, ENC_(rb_str_new((char*)bytes.str, bytes.len)));
  UniStr_free(&bytes);

  return out;
}
width(p1, p2 = v2) click to toggle source
/*
 * Unicode.width(str, cjk = false)
 *
 * Sums a per-character column width over +str+:
 *   - a control character (1..31 or 0x7f..0x9f) makes the whole result -1;
 *   - NUL, categories Mn/Me/Cf (except SOFT HYPHEN, U+00AD) and
 *     U+1160..U+11FF count as 0;
 *   - East Asian width F or W, the listed reserved CJK ranges, and width A
 *     when +cjk+ is truthy count as 2;
 *   - everything else counts as 1.
 *
 * str must be a String (Check_Type raises TypeError otherwise).
 * Returns the total as a Fixnum.
 */
VALUE
unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
{
  WString codepoints;
  VALUE str;
  VALUE cjk;
  int idx;
  int nargs;
  int total = 0;
  int ambiguous_is_wide = 0;

  /* "11": one required argument (str) and one optional argument (cjk). */
  nargs = rb_scan_args(argc, argv, "11", &str, &cjk);
  if (nargs > 1)
    ambiguous_is_wide = RTEST(cjk);
  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&codepoints, RSTRING_PTR(str), RSTRING_LEN(str));

  for (idx = 0; idx < codepoints.len; idx++) {
    int ch = codepoints.str[idx];
    int gencat = get_gencat(ch);
    int eawidth = get_eawidth(ch);
    int is_control;
    int is_zero_width;
    int is_wide;

    /* Control Characters: the whole string has no defined width */
    is_control = (ch > 0 && ch < 32) || (ch >= 0x7f && ch < 0xa0);
    if (is_control) {
      total = -1;
      break;
    }

    is_zero_width =
      ch != 0x00ad && /* SOFT HYPHEN is never zero width */
      (gencat == c_Mn || gencat == c_Me || /* Non-spacing Marks */
       gencat == c_Cf || /* Format */
       ch == 0 || /* NUL */
       (ch >= 0x1160 && ch <= 0x11ff)); /* HANGUL JUNGSEONG/JONGSEONG */
    if (is_zero_width)
      continue;

    is_wide =
      eawidth == w_F || eawidth == w_W || /* Fullwidth or Wide */
      (ch >= 0x4db6 && ch <= 0x4dbf) || /* CJK Reserved */
      (ch >= 0x9fcd && ch <= 0x9fff) || /* CJK Reserved */
      (ch >= 0xfa6e && ch <= 0xfa6f) || /* CJK Reserved */
      (ch >= 0xfada && ch <= 0xfaff) || /* CJK Reserved */
      (ch >= 0x2a6d7 && ch <= 0x2a6ff) || /* CJK Reserved */
      (ch >= 0x2b735 && ch <= 0x2b73f) || /* CJK Reserved */
      (ch >= 0x2b81e && ch <= 0x2f7ff) || /* CJK Reserved */
      (ch >= 0x2fa1e && ch <= 0x2fffd) || /* CJK Reserved */
      (ch >= 0x30000 && ch <= 0x3fffd) || /* CJK Reserved */
      (ambiguous_is_wide && eawidth == w_A); /* East Asian Ambiguous */

    /* Halfwidth or Neutral characters take a single column. */
    total += is_wide ? 2 : 1;
  }
  WStr_free(&codepoints);

  return INT2FIX(total);
}