metadata {

authority_id: bgnpcgn
id: 2008
language: bal
source_script: Arab
destination_script: Latn
name: ROMANIZATION OF BALUCHI -- BGN/PCGN 2008 System
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693687/ROMANIZATION_OF_BALUCHI.pdf
creation_date: 2008
confirmation_date: 2017-11
description: |
  The following is the BGN/PCGN-approved romanization
  system for deriving standard spellings of Baluchi
  geographic names. The romanization system is based on
  the Hunterian system of romanization, which has been
  used by the Surveys of India and Pakistan for
  romanizing Baluchi geographic names for more than one
  hundred years. The romanization system is compatible
  with all dialects of Baluchi, including Eastern
  Baluchi, Western Baluchi, and Southern Baluchi.

  The BGN/PCGN system laid out below includes diacritical
  marks in order that the original script can be derived
  from the romanized form (i.e. it is reversible). For
  desk users requiring a diacritic-free form, these
  diacritics can simply be removed. In almost every case
  the same basic Roman-script characters are kept as are
  used in the Hunterian system. The BGN/PCGN forms have
  further been designed to harmonize with the BGN/PCGN
  Urdu romanization system. In rigorous romanization
  (i.e. including diacritics), retroflexion is marked by
  a sub-dot, and aspiration is marked by an apostrophe,
  where confusion with fricative digraphs could arise.
  For letters used only in Arabic loan words, the
  rigorous forms have further been designed to harmonize
  with the BGN/PCGN Persian romanization system.

notes:
  - Occasionally, sequences of /z/ or /s/ plus /h/ may be
    encountered, i.e. z·h, s·h. These may be romanized with the
    Unicode 'center dot' (U+00B7) separating the two letters,
    to distinguish them from the digraphs /zh/ and /sh/.

  - The character ة is found very rarely in Baluchi, principally in certain Arabic religious terms, e.g. zakāt
    ('alms'). It should be romanized t.

  - When the letters ال are found, representing the Arabic
    definite article, the ل is assimilated to a following 'sun letter'
    (ت, ث, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ل or ن) and is romanized
    t, t͟h, d, d͟h, r, z, s, sh, ş, ẕ, ţ, z̧, l, n accordingly.

  - In romanization, the suffixes ءَ (-ā, singular definite)
    and ءِ (-ay, possessive) are connected to the previous word
    by a hyphen, though they are usually written separately.

  - The word for 'and', written as و or ءُ, should be
    romanized as -u-, linked by hyphens to the two words it
    connects; e.g.,
    سند و ہند → Sind-u-Hind ('The Gangetic Plain').

  - Except as specified in notes 4 and 5, word division in romanization should follow word division in the Baluchi script.

  - Note that the short vowels in the Baluchi examples are not pointed.

  - Certain initial, medial and final characters are not
    readily available in a Unicode-encoded font in a standalone form.

  - The Romanization columns show only lowercase forms but,
    when romanizing, uppercase and lowercase Roman letters as
    appropriate should be used.

}

tests {

# Each `test` line pairs a Baluchi (Arabic-script) input with its expected
# romanization. Where an English gloss is given, it is the quoted comment on
# the line directly above the test it describes.
#
# Entries still written in the commented-out `- source: / expected:` form are
# disabled test cases:
# commented tests are blocked by https://github.com/interscript/interscript/issues/620
# 'cultivable patch of riverbed'
test "بےنٹَگ", "Benṭag"
# 'Japan'
test "جاپان", "Jāpān"
test "اَرَبِستان", "Arabistān"
test "بُنجاه", "Bunjāh"
test "بَلوچِستان", "Balochistān"
# 'village'
test "حَلق", "Ḩalq"
# 'foothills or skirts of a mountain'
test "دامان", "Dāmān"
test "ڈاڈَر", "Ḍāḍar"
# 'tomb'
test "گُمبُذ", "Gumbud͟h"
# 'crossroads'
test "چار راہ", "Chār Rāh"
# 'market'
test "بازار", "Bāzār"
test "سےبِى", "Sebī"
# Disabled (see issue link above):
# - source: اِيشيا
#   expected: Eshyā
#   # 'homeland'
# - source: وَطَن
#   expected: Waţan
# 'Bandar Abbas'
test "عَبّاس", "‘Abbās"
# 'Taiwan'
test "فارموسا", "Fārmosā"
test "ڈاک", "Ḍāk"
# 'stream, irrigated area, pasture'
test "مَلّ", "Mall"
# Disabled (see issue link above).
# NOTE(review): the source below contains a space after the first letter,
# which looks like an extraction artifact — confirm before enabling.
# - source: ہ یرات
#   expected: Herāt
# 'Philippines'
test "فِلپائِن", "Filpā’in"
test "مُرگاپ", "Murgāp"
# Disabled (see issue link above):
# - source: مَرو
#   expected: Marw

}

stage {

# CHARACTERS
parallel {

  # consonant characters

  sub "\u0628", "b" # ب
  sub "\u067E", "p" # پ
  sub "\u062a", "t" # ت
  sub "\u0679", "ṭ" # see note 8 ٹ
  sub "\u067C", "ṭ" # see note 8 ټ
  sub "\u062B", "t͟h" # see note 8 ث
  sub "\u067F", "t͟h" # see note 8 ٿ
  sub "\u062c", "j" # ج
  sub "\u0686", "ch" # ‫چ‬
  sub "\u062d", "ḩ" # ح
  sub "\u062e", "kh" # خ
  sub "\u062f", "d" # د
  sub "\u0688", "ḍ" # ڈ
  sub "\u0689", "ḍ" # ‫ډ‬
  sub "\u0630", "d͟h" # ذ
  sub "\u0631", "r" # ر
  sub "\u0691", "ṛ" # see note 8 ڑ
  sub "\u0693", "ṛ" # see note 8 ړ
  sub "\u0632", "z" # ز
  sub "\u0698", "zh" # ‫ژ‬
  sub "\u0633", "s" # س
  sub "\u0634", "sh" # ش
  sub "\u0635", "ş" # ص
  sub "\u0636", "ẕ" # ض
  sub "\u0637", "ţ" # ط
  sub "\u0638", "z̧" # ظ
  sub "\u0639", "‘" # ع
  sub "\u063a", "gh" # غ
  sub "\u0641", "f" # ف
  sub "\u0642", "q" # ق
  sub "\u0643", "k" # ك
  sub "\u06A9", "k" # ک
  sub "\u06AF", "g" # ‫گ‬
  sub "\u0644", "l" # ل
  sub "\u0645", "m" # م
  sub "\u0646", "n" # ن
  sub "\u06BA", "ñ" # ں
  sub "\u0648", "o" # و

  sub "\u0648", "w" # و
  sub "\u0647", "h" # ه
  sub "\u06C1", "h"
  sub "\u06BE", "h"
  sub "\u0621", "’" # ء
  sub "\u0626", "’" # ئ
  sub "\u0649", "y" # ي
  sub "\u064A", "y" # ي

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u0628\u06BE", "bh"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u067E\u06BE", "ph"

  # Aspiration is only contrastive in Eastern Baluchi.
  # Apostrophe distinguishes from fricative /th/.
  sub "\u062A\u06BE", "th’"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u0679\u06BE", "ṭh"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u062C\u06BE", "jh"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u0686\u06BE", "chh"

  # Aspiration is only contrastive in Eastern Baluchi.
  # Apostrophe distinguishes from fricative /dh/
  sub "\u062D\u06BE", "dh’"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u0688\u06BE", "ḍh"

  # Aspiration is only contrastive in Eastern Baluchi
  sub "\u0631\u06BE", "\u1E5B\u0068"

  # Aspiration is only contrastive in Eastern Baluchi.
  # Apostrophe distinguishes from fricative /kh/
  sub "\u06A9\u06BE", "kh’"

  # Aspiration is only contrastive in Eastern Baluchi.
  # Apostrophe distinguishes from fricative /gh/
  sub "\u06AF\u06BE", "gh’" #
  sub "\u0644\u0627", "lā" #
  sub "\u06A9\u0627", "kā" #
  sub "\u06AF\u0627", "gā" #
  sub "\u06A9\u0644", "kl" #
  sub "\u06AF\u0644", "gl" #

  # Vowels, Diphthongs, and Diacritical Marks
  sub "\u0650\u0649", "ī" # ـِي
  sub "\u0650", "i" #  ِ
  sub "\u06D2", "e" # ـے
  sub boundary + "\u0627", "" # ا
  sub "\u0627", "ā" # ا
  sub "\u0622", "ā" # آ
  sub "\u064E", "a" #  َ
  sub "\u064F", "u" #  ُ
  sub "\u064F\u0648", "ū" #  ـُو
  sub "\u064E\u06D2", "ay" #  ـَي
  sub "\u064E\u0648", "aw" # ـَو
  sub "\u0652", "" # Not Romanized
  sub "\u0670", "á" #

  sub "\u0628\u0651", "bb" # ب
  sub "\u067E\u0651", "pp" # پ
  sub "\u062a\u0651", "tt" # ت
  sub "\u0679\u0651", "ṭṭ" # see note 8 ٹ
  sub "\u067C\u0651", "ṭṭ" # see note 8 ټ
  sub "\u062B\u0651", "t͟ht͟h" # see note 8 ث
  sub "\u067F\u0651", "t͟ht͟h" # see note 8 ٿ
  sub "\u062c\u0651", "jj" # ج
  sub "\u0686\u0651", "chch" # ‫چ‬
  sub "\u062d\u0651", "ḩḩ" # ح
  sub "\u062e\u0651", "khkh" # خ
  sub "\u062f\u0651", "dd" # د
  sub "\u0688\u0651", "ḍḍ" # ڈ
  sub "\u0689\u0651", "ḍḍ" # ‫ډ‬
  sub "\u0630\u0651", "d͟hd͟h" # ذ
  sub "\u0631\u0651", "rr" # ر
  sub "\u0691\u0651", "ṛṛ" # see note 8 ڑ
  sub "\u0693\u0651", "ṛṛ" # see note 8 ړ
  sub "\u0632\u0651", "zz" # ز
  sub "\u0698\u0651", "zhzh" # ‫ژ‬
  sub "\u0633\u0651", "ss" # س
  sub "\u0634\u0651", "shsh" # ش
  sub "\u0635\u0651", "şş" # ص
  sub "\u0636\u0651", "ẕẕ" # ض
  sub "\u0637\u0651", "ţţ" # ط
  sub "\u0638\u0651", "z̧z̧" # ظ
  sub "\u0639\u0651", "‘‘" # ع
  sub "\u063a\u0651", "ghgh" # غ
  sub "\u0641\u0651", "ff" # ف
  sub "\u0642\u0651", "qq" # ق
  sub "\u0643\u0651", "kk" # ك
  sub "\u06A9\u0651", "kk" # ک
  sub "\u06AF\u0651", "gg" # ‫گ‬
  sub "\u0644\u0651", "ll" # ل
  sub "\u0645\u0651", "mm" # م
  sub "\u0646\u0651", "nn" # ن
  sub "\u06BA\u0651", "ññ" # ں
  sub "\u0648\u0651", "ww" # و
  sub "\u0647\u0651", "hh" # ه
  sub "\u06C1\u0651", "hh"
  sub "\u06BE\u0651", "hh"
  sub "\u0621\u0651", "’’" # ء
  sub "\u0626\u0651", "’’" # ئ
  sub "\u0649\u0651", "yy" # ي

  sub "\u0621\u064E", "-ā" # see note 4
  sub "\u0621\u0650", "-ay" # see note 4

  # Numerals
  sub "۰", "0"
  sub "۱", "1"
  sub "۲", "2"
  sub "۳", "3"
  sub "۴", "4"
  sub "۵", "5"
  sub "۶", "6"
  sub "۷", "7"
  sub "۸", "8"
  sub "۹", "9"
  # Although Perso-Arabic script is written from right to
  # left, numerical expressions, e.g. ۸۶۹۱ → 1968, are
  # written from left to right. A comma is inserted into
  # longer sequences, either after thousands, millions,     etc.
}

# POSTRULES
sub any("\u0061".."\uFFFF"), upcase, before: boundary, not_before: boundary + any("‘’'")

}