-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
9944 lines (5973 loc) · 290 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Text Layout Requirements for the Arabic Script</title>
<script src="https://www.w3.org/Tools/respec/respec-w3c-common" async class="remove" type=
"text/javascript"></script>
<script class="remove" type="text/javascript">
// Configuration object for the ReSpec script loaded in the <script src> tag above.
// It holds the document metadata (status, editors, working group, links) and the
// locally defined bibliography entries cited from the body of this document.
var respecConfig = {
// specification status (e.g. WD, LCWD, WG-NOTE, etc.). If in doubt use ED.
specStatus: "ED",
// Publication-history fields, currently disabled (commented out).
//publishDate: "2015-07-21",
//previousPublishDate: "2014-12-16",
//previousMaturity: "FPWD",
// Not on the Recommendation track; the status section states the end target is a Working Group Note.
noRecTrack: true,
shortName: "alreq",
copyrightStart: "2015",
edDraftURI: "https://w3c.github.io/alreq/",
// if this is a LCWD, uncomment and set the end of its review period
// lcEnd: "2009-08-05",
// editors, add as many as you like
// only "name" is required
editors: [
{ name: "Shervin Afshar", mailto: "[email protected]", company: "Netflix" },
{ name: "Behnam Esfahbod", mailto: "[email protected]", company: "Quora/Virgule Typeworks" },
{ name: "Mostafa Hajizadeh", mailto: "[email protected]" },
{ name: "Najib Tounsi", mailto: "[email protected]", company: "W3C" },
],
// Publishing working group: display name, home page, and public mailing list name.
wg: "Internationalization Working Group",
wgURI: "https://www.w3.org/International/core/",
wgPublicList: "public-i18n-arabic",
// Issue tracker links; both entries point at the GitHub issues page of the alreq repository.
bugTracker: {
new: "https://github.com/w3c/alreq/issues",
open: "https://github.com/w3c/alreq/issues",
} ,
// Additional link group ("Github") with a single entry pointing at the source repository.
otherLinks: [
{
key: "Github",
data: [
{
value: "repository",
href: "https://github.com/w3c/alreq"
}
]
}
],
// URI of the patent status for this WG, for Rec-track documents
// !!!! IMPORTANT !!!!
// This is important for Rec-track documents, do not copy a patent URI from a random
// document unless you know what you're doing. If in doubt ask your friendly neighbourhood
// Team Contact.
wgPatentURI: "https://www.w3.org/2004/01/pp-impl/32113/status",
// !!!! IMPORTANT !!!! MAKE THE ABOVE BLINK IN YOUR HEAD
// Bibliography entries defined locally. Each key matches a citation used in the
// document body in double square brackets (e.g. BIDI, UBA-BASICS).
localBiblio: {
"BIDI": {
"authors": [
"Mark Davis",
"Aharon Lanin",
"Andrew Glass"
],
"href": "http://www.unicode.org/reports/tr9/",
"publisher": "Unicode Consortium",
"status": "Unicode Standard Annex #9",
"title": "Unicode Bidirectional Algorithm",
"id": "BIDI",
},
"UBA-BASICS": {
"authors": [
"Richard Ishida",
],
"href": "https://www.w3.org/International/articles/inline-bidi-markup/uba-basics",
"publisher": "World Wide Web Consortium",
"title": "Unicode Bidirectional Algorithm basics",
"id": "UBA-BASICS",
},
"UNICODE": {
"href": "http://www.unicode.org/versions/latest/",
"publisher": "The Unicode Consortium",
"title": "The Unicode Standard",
"id": "UNICODE"
},
"W3-ARAB-MATH": {
"authors": [
"Azzeddine Lazrek",
"Mustapha Eddahibi",
"Khalid Sami",
"Bruce R. Miller"
],
"href": "https://www.w3.org/TR/arabic-math/",
"publisher": "World Wide Web Consortium",
"title": "Arabic mathematical notation",
"id": "W3-ARAB-MATH"
}
},
};
</script>
<link rel="stylesheet" href="local.css" type="text/css">
</head>
<body>
<section id="abstract">
<p>This document describes requirements for the layout and presentation of text in languages
that use the Arabic script when they are used by Web standards and technologies, such as HTML,
CSS, Mobile Web, Digital Publications, and Unicode.</p>
</section>
<section id="sotd">
<p>This document describes the basic requirements for Arabic script layout and text support on
the Web and in eBooks. These requirements provide information for Web technologies such as CSS,
HTML and digital publications about how to support users of Arabic scripts. Currently the
document focuses on Standard Arabic and Persian.</p>
<p>The editor's draft of this document is being developed by the <a href=
"http://w3c.github.io/alreq/homepage/">Arabic Layout Task Force</a>, part of
the W3C <a href="https://www.w3.org/International/ig/">Internationalization Interest Group</a>.
It is published by the <a href="https://www.w3.org/International/core/">Internationalization
Working Group</a>. The end target for this document is a Working Group Note.</p>
<div class="note" title="Sending Comments on This Document">
<p data-lang="en">If you wish to make comments regarding this document, please raise them as
<a href="https://github.com/w3c/alreq/issues" style="font-size: 120%;">github issues</a>
<!--against the <a href="https://www.w3.org/TR/2015/WD-alreq-TODO/" style="font-size: 120%">latest dated version in /TR</a>-->.
Only send comments by email if you are unable to raise issues on github (see links below).
All comments are welcome.</p>
<p data-lang="en">To make it easier to track comments, please raise separate issues or emails
for each comment, and point to the section you are commenting on using a URL for the
dated version of the document.</p>
</div>
</section>
<section id="h_introduction">
<h2><a href="#h_introduction">Introduction</a>
</h2>
<section id="h_about_this_document">
<h3><a href="#h_about_this_document">About this document</a>
</h3>
<p>The aim of this document is to describe the basic requirements for Arabic script layout and text support on the Web and in eBooks. These requirements provide information for Web technologies such as CSS, HTML and digital publications, and for application developers, about how to support users of Arabic scripts. The document focuses on Standard Arabic and Persian.</p>
<section id="h_gap_analysis">
<h4>Gap Analysis</h4>
<p>This document is pointed to by a separate document, <a href="https://w3c.github.io/alreq/gap-analysis/">Arabic &amp; Persian Gap Analysis</a>, which describes gaps in support for Arabic and Persian on the Web, and prioritises and describes the impact of those gaps on the user.</p>
<p>Wherever an unsupported feature is identified through the gap analysis process, the requirements for that feature need to be documented. This document is where those requirements are described.</p>
<p>This document should contain no reference to a particular technology. For example, it should not say "CSS does/doesn't do such and such", and it should not describe how a technology, such as CSS, should implement the requirements. It is technology agnostic, so that it will be evergreen, and it simply describes how the script works. The gap analysis document is the appropriate place for all kinds of technology-specific information.</p>
</section>
<section id="h_info_requests">
<h4>Other Related Resources</h4>
<p>The document <a href="https://w3c.github.io/typography/">International text layout and typography index</a> (known informally as the text layout index) points to this document and others, and provides a central location for developers and implementers to find information related to various scripts.</p>
<p>The W3C also maintains a tracking system that has links to github issues in W3C repositories. There are separate links for (a) requests from developers to the user community for information about how scripts/languages work, (b) issues raised against a spec, and (c) browser bugs. For example, you can find out <a href="http://w3c.github.io/i18n-activity/textlayout/?filter=type-info-request">what information developers are currently seeking</a>, and the resulting list can also be filtered by script.</p>
</section>
</section>
<section id="h_languages">
<h3><a href="#h_languages">Languages</a>
</h3>
<p>This document is focused on two languages: Standard Arabic and Persian.</p>
<section id="h_standard_arabic_language">
<h4><a href="#h_standard_arabic_language">Standard Arabic Language</a>
</h4>
<p><dfn>Standard Arabic</dfn>—a.k.a. Modern Standard Arabic or Literary Arabic—is the
standardized and literary variety of Arabic used in writing and in most formal speech in
countries of Northern Africa and West Asia. Regional and classical dialects of Arabic may
differ in layout and text details and are <em>not</em> covered by this document.</p>
<p>However, there are some major differences in common practices between the <dfn>Western
Arab regions</dfn>—that is North-West Africa—and <dfn>Eastern Arab regions</dfn>—which is
North-East Africa and West Asia. For example, the numeral digits used in the two regions
and their formatting are vastly different. There is, however, no clear line between the
Eastern and Western Arab regions.</p>
</section>
<section id="h_persian_language">
<h4><a href="#h_persian_language">Persian Language</a>
</h4>
<p><dfn>Persian</dfn>—a.k.a. Modern Persian—is the standardized and literary variety of the
official languages used in Iran and Afghanistan. The dialect of Persian in Iran is also
called Western Persian, and is locally known as <span class="qterm">Farsi</span>. The
dialect of Persian in Afghanistan is also known as Eastern Persian, and is locally known as
Dari.</p>
<p><dfn>Tajik</dfn>—a.k.a. Tajiki or Tajiki Persian—is the Persian language as used in
Tajikistan. It is written in the Cyrillic script and is therefore <em>not</em> covered by
this document.</p>
</section>
</section>
</section>
<section id="h_arabic_script_overview">
<h2><a href="#h_arabic_script_overview">Arabic Script Overview</a>
</h2>
<section id="h_encoding">
<h3><a href="#h_encoding">Encoding</a>
</h3>
<p>Arabic script is encoded in the Unicode standard <em>semantically</em>, meaning that every
letter receives only a single Unicode character, no matter how many different contextual
shapes it may exhibit.</p>
<p>Unicode also has a partial set of <em>non-semantic</em> encoded characters for the Arabic
script, under blocks <em>Arabic Presentation Forms-A</em> and <em>Arabic Presentation
Forms-B</em>, which are deprecated and should not be used in general interchange.</p>
</section>
<section id="h_characters">
<h3><a href="#h_characters">Characters</a>
</h3>
<p>Appendix <a href="#characters-tables"></a> lists characters used for the Arabic and
Persian languages. Characters used for these languages include letters and diacritics, three
sets of digits (usage depending on the region), punctuation (some common and some specific to
the script), symbols, and Unicode formatting characters.</p>
<p>The majority of these characters are common among different languages. There are three
different sets of digits used by different languages. Most of the alphabetical characters are
used by all the languages using Arabic scripts, but there are exceptions, such as the Arabic
letter <span class="lettername">yeh</span> being represented with two different characters,
<span class="uname">U+064A ARABIC LETTER YEH</span> (<span lang="ar" dir="rtl">ي</span>) and
<span class="uname">U+06CC ARABIC LETTER FARSI YEH</span> (<span lang="ar" dir=
"rtl">ی</span>). These differences among the character sets of each language are marked in
the appendix tables.</p>
<p>Control characters are used to produce the correct spelling of the words or to ensure
correct combination with left-to-right content. Consequently, they should be preserved when
storing and displaying texts.</p>
</section>
<section id="h_direction">
<h3><a href="#h_direction">Direction</a>
</h3>
<p>Arabic script is written from right to left. Numbers, even Arabic numbers, are written
from left to right, as is text in a script that is normally left-to-right.</p>
<p>When the main script is Arabic, the layout and structure of pages and documents are also
set from right to left.</p>
<p><dfn data-lt="bidirectional algorithm|bidi algorithm"><a href=
"http://www.unicode.org/reports/tr9/">Unicode Bidirectional Algorithm</a></dfn> (or
<span class="qterm">bidi algorithm</span>, for short) [[!BIDI]] details an algorithm for
rendering right-to-left text and covers a myriad of situations in mixing different kinds of
characters. A simpler explanation of the basics of the algorithm exists in the W3C article
<a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode
Bidirectional Algorithm basics</a> [[UBA-BASICS]]. You can refer to these documents for more
information about Unicode’s bidirectional algorithm.</p>
<p>A brief overview of the <a>bidirectional algorithm</a> follows, because the direction is
an essential part of how Arabic script is used.</p>
<p>The characters of a text are digitally stored and transferred in the same order that they
are typed by a user. This is the order in which the text is read and pronounced by people and
held in memory by software applications, as shown in <a href="#figure-order_in_memory"></a>
for a sample text.</p>
<figure id="figure-order_in_memory">
<img src="images/order-in-memory.svg" alt="The order of characters in memory">
<figcaption>
The order of characters in memory
</figcaption>
</figure>
<p>But the order used when displaying text is different. The purpose of the bidi algorithm is
to find display positions for the characters of a text. These positions are solely used for
displaying texts. <a href="#figure-order_when_displayed"></a> shows the same sample text when
prepared for display with the bidi algorithm.</p>
<figure id="figure-order_when_displayed">
<img src="images/order-when-displayed.svg" alt="The order of characters when displayed">
<figcaption>
The order of characters when displayed
</figcaption>
</figure>
<p>An initial step of the process involves determining each paragraph’s <span class=
"qterm">base direction</span>: whether the paragraph is left-to-right or right-to-left. The
base direction is either explicitly set by the author, inherited from the page, or (typically
for user-generated content) detected based on the content of the paragraph. The base
direction has two important uses later in the process.</p>
<p>The next step is to split the text into <span class="qterm">directional runs</span>. Each
directional run is a sequence of characters with the same direction.</p>
<figure id="figure-directional_runs">
<img src="images/directional-runs.svg" alt="Splitting a text into 3 directional runs">
<figcaption>
Splitting a text into 3 directional runs
</figcaption>
</figure>
<p>Inside each run, all the characters follow the same order. The runs themselves are ordered
for visual representation from left to right or from right to left, depending on the base
direction of the paragraph. <a href="#figure-order_of_directional_runs"></a> demonstrates an
example of this. This is the first effect of the base direction.</p>
<figure id="figure-order_of_directional_runs">
<img src="images/order-of-directional-runs.svg" alt=
"The effect of base direction on the order of runs">
<figcaption>
The effect of base direction on the order of runs
</figcaption>
</figure>
<p>Unicode has a <span class="qterm">bidi class</span> (or <span class="qterm">bidi
type</span>) property defined for each character that is used to determine the direction of
each character. All the Arabic letters are marked as right-to-left characters, while Latin
characters have the left-to-right category.</p>
<p>Some characters, mostly punctuation, are <span class="qterm">neutral</span>. The
direction of these characters is derived from their surrounding characters. If a neutral
character is surrounded by characters of the same direction (e.g. a space surrounded by
Arabic letters), it gets the direction of its neighbors. Otherwise (e.g. a space between an
Arabic and a Latin letter, or a neutral character appearing at the start or the end of a paragraph),
the neutral character gets its direction from the paragraph’s base direction. This is another
effect of the base direction in the bidi algorithm.</p>
<p>The above explanation of the bidi algorithm is highly simplified, to convey only the
essentials of how Arabic text is transformed for rendering. The actual algorithm deals with
many more character types and edge cases. Please refer to <a href=
"https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode
Bidirectional Algorithm basics</a> [[UBA-BASICS]] for more information or <a href=
"http://www.unicode.org/reports/tr9/">Unicode Bidirectional Algorithm</a> [[!BIDI]] for the
official detailed documentation.</p>
</section>
<section id="h_joining">
<h3><a href="#h_joining">Joining</a>
</h3>
<p>Arabic script is a cursive writing system; i.e., letters can join to their neighboring
letters. Besides the core behavior of the script, there are some details on how content is
encoded in Unicode, and some rules around joining behavior when rendering special cases.</p>
<section id="h_joining_forms">
<h4><a href="#h_joining_forms">Joining Forms</a>
</h4>
<p>Every Arabic letter has one, two, or four different joining forms, which allow the
letter to join to its neighbors, if applicable. These four forms are:</p>
<ul>
<li><dfn data-lt="isolated">Isolated form</dfn>, used when the letter does not join to
any of the surrounding letters;</li>
<li><dfn data-lt="initial">Initial form</dfn>, used when the letter is joining only to
its next (left-hand side) letter;</li>
<li><dfn data-lt="medial">Medial form</dfn>, used when the letter is joining on both
sides, and</li>
<li><dfn data-lt="final">Final form</dfn>, used when the letter is joined only to its
previous (right-hand side) letter.</li>
</ul>
<p><a href="#figure-letter_meem_shapes"></a> shows samples of all four joining forms for
<span class="uname">U+0645 ARABIC LETTER MEEM</span> (<span dir="rtl" lang=
"ar">م</span>).</p>
<figure id="figure-letter_meem_shapes">
<img height="140" src="images/drawings/joining/joining-meem-isolated.png" alt=
"Isolated joining form of U+0645 ARABIC LETTER MEEM.">
<img height="140" src="images/drawings/joining/joining-meem-final.png" alt=
"Final joining form of U+0645 ARABIC LETTER MEEM.">
<img height="140" src="images/drawings/joining/joining-meem-medial.png" alt=
"Medial joining form of U+0645 ARABIC LETTER MEEM.">
<img height="140" src="images/drawings/joining/joining-meem-initial.png" alt=
"Initial joining form of U+0645 ARABIC LETTER MEEM.">
<figcaption>
Four different letter forms for joining to surrounding letters
</figcaption>
</figure>
<p>We define the following two groups of joining forms:</p>
<ul>
<li><dfn data-lt="join-to-left">Join-to-left forms</dfn>: either Initial form or Medial
form of a letter, which joins to the left-hand side (next) letter. Other forms are called
<dfn data-lt="non-join-to-left form">non-join-to-left</dfn>.</li>
<li><dfn data-lt="join-to-right">Join-to-right forms</dfn>: either Medial form or Final
form of a letter, which joins to the right-hand side (previous) letter. Other forms are
called <dfn data-lt="non-join-to-right form">non-join-to-right</dfn>.</li>
</ul>
</section>
<section id="h_joining_categories">
<h4><a href="#h_joining_categories">Joining Categories</a>
</h4>
<p>There are different categories of letters based on their joining behavior:</p>
<ul>
<li>
<dfn data-lt="dual-joining">Dual-joining letters</dfn>: can join from both sides, like
the letter in <a href="#figure-letter_meem_shapes"></a>, and have all four shapes
mentioned above.
</li>
<li>
<dfn data-lt="right-joining">Right-joining letters</dfn>: can only join to their
previous (right-hand side) letter, and therefore, only have <a>isolated</a> and
<a>final</a> shapes. <a href="#figure-letter_reh_shapes"></a> shows samples of both
forms for U+0631 ARABIC LETTER REH (ر).
</li>
<li>
<dfn data-lt="non-joining">Non-joining letters</dfn>: cannot join to any surrounding
letter, and therefore can only take the <a>isolated</a> form. <a href=
"#figure-letter_hamzah_shape"></a> shows a sample of U+0621 ARABIC LETTER HAMZA (ء) in
its only possible form.
</li>
</ul>
<figure id="figure-letter_reh_shapes">
<img height="140" src="images/drawings/joining/joining-reh-isolated.png" alt=
"Isolated joining form of U+0631 ARABIC LETTER REH.">
<img height="140" src="images/drawings/joining/joining-reh-final.png" alt=
"Final joining form of U+0631 ARABIC LETTER REH.">
<figcaption>
<a>Right-joining letters</a> only have two forms: <a>final</a> and <a>isolated</a>.
</figcaption>
</figure>
<p>Most Arabic letters are either <a>dual-joining</a> or <a>right-joining</a>.</p>
<figure id="figure-letter_hamzah_shape">
<img height="140" src="images/drawings/joining/joining-hamza.png" alt=
"One joining form of U+0621 ARABIC LETTER HAMZA.">
<figcaption>
<a>Non-Joining letters</a> only have one form: <a>isolated</a>.
</figcaption>
</figure>
</section>
<section id="h_joining_rules">
<h4><a href="#h_joining_rules">Joining Rules</a>
</h4>
<p>There are core rules on how letters join to each other in the Arabic script, which stay
valid regardless of the medium (hand-writing, typewriter, movable-type, digital, etc.):</p>
<ol>
<li id="joining_rule_1">Letters of each word join together whenever possible,
implicitly.</li>
<li id="joining_rule_2">In some languages, like Persian and Urdu, there are words—mostly,
but not limited to, compound words—that require explicit breaks in the joining of
letters, although joining would otherwise be possible.</li>
<li id="joining_rule_3">In certain cases, a letter can be in a <a>join-to-left</a> form
without actually connecting to anything on the left, whether there’s any letter or not.
This is often seen in list counters, abbreviations, and other cases where letters do not
have a word context, or are taken out of their original word context.
</li>
<li id="joining_rule_4">In rare cases of words splitting where letters are joined, the first
letter of the second half will be in a <a>join-to-right</a> form without any previous
letter. This behavior is limited to special cases like blanking specific letters of a
word, line breaks in a paragraph, and word breaks across poetry verses. No standalone
word can have any letters in <a>join-to-right</a> form without joining on the right-hand
side.
</li>
</ol>
<p><a href="#figure-joining_process"></a> demonstrates how letters join (per Joining Rule
1) to form a word.</p>
<figure id="figure-joining_process">
<img src="images/joining-process.png" alt="Letter BEH and MEEM join to form a word.">
<figcaption>
Letters join by taking their relevant form.
</figcaption>
</figure>
</section>
<section id="h_joining_control">
<h4><a href="#h_joining_control">Joining Control</a>
</h4>
<p>Arabic letters are represented in their intended joining forms in hand-writing,
typewriters, and old (deprecated) digital encodings of the script. In Unicode, letters are
encoded semantically—meaning without any information about their joining form—and therefore
there is a need for a mechanism for controlling the joining behavior of the letters.</p>
<p>In Unicode, by default, neighboring Arabic letters join together if and only if both
letters are able to join towards the other.</p>
<section id="h_disjoining_enforcement">
<h4><a href="#h_disjoining_enforcement">Disjoining Enforcement</a>
</h4>
<p>As noted in Joining Rule 2, sometimes two Arabic letters sit next to each other (in
one word) which would normally join together, but should not. In Unicode, for such a
case, a special character should be used to enforce disjoining of these letters. This
character is called <span class="uname">U+200C ZERO WIDTH NON-JOINER</span>, or
<dfn>ZWNJ</dfn> for short.</p>
<figure id="figure-disjoining_enforcement">
<img height="140" src="images/drawings/joining/joining-beh-yeh-zwnj-beh-yeh.png" alt=
"ZWNJ example.">
<figcaption>
Example of using <a>ZWNJ</a> for <a href="#h_disjoining_enforcement">disjoining
enforcement</a>.
</figcaption>
</figure>
</section>
<section id="h_joining_enforcement">
<h4><a href="#h_joining_enforcement">Joining Enforcement</a>
</h4>
<p>Similarly, as noted in Joining Rule 4, sometimes an Arabic letter needs to take a
joining form when it would not happen normally. For example, some abbreviation methods use
Initial Form of letters, when possible, for every letter in the abbreviation. Again, in
Unicode, a special character should be used to enforce joining on this letter. This
character is called <span class="uname">U+200D ZERO WIDTH JOINER</span>, or
<dfn>ZWJ</dfn> for short.</p>
<p>Besides <a>ZWJ</a>, there’s another special Unicode character, <span class=
"uname">U+0640 ARABIC TATWEEL</span>, which enforces joining behavior (join causing) on
letters next to it. But, in contrast to <a>ZWJ</a>, <dfn>TATWEEL</dfn> has a glyph shape,
looking like a hyphen and usually as wide as the SPACE glyph, which connects to the
letters on the main joining line (a.k.a. base-line). So, using <a>TATWEEL</a> would give
a similar Joining Enforcement behavior, but has a side effect of wider length for the
letter, which is not always desired. That’s why it’s highly recommended to only use
<a>ZWJ</a> for joining control.</p>
<figure id="figure-joining_enforcement">
<img height="140"
src="images/drawings/joining/joining-heh-zwj-fullstop-sheh-fullstop.png"
alt="ZWJ example.">
<img height="140"
src="images/drawings/joining/joining-heh-tatweel-fullstop-sheh-fullstop.png"
alt="TATWEEL example.">
<figcaption>
Example of using <a>ZWJ</a> (recommended) and <a>TATWEEL</a> (not recommended) for
<a href="#h_joining_enforcement">joining enforcement</a>.
</figcaption>
</figure>
</section>
<p>In Unicode, <a>ZWNJ</a> and <a>ZWJ</a> are called <dfn>Joining Control Characters</dfn>.</p>
<section id="h_joining_disjoining_enforcement">
<h4><a href="#h_joining_disjoining_enforcement">Joining-Disjoining Enforcement</a>
</h4>
<p>The two enforcement methods mentioned above can be combined together to form a
<dfn>Joining-Disjoining Enforcement</dfn> method, that enables <a href=
"#joining_rule_3">Joining Rule 3</a> for cases when there’s a
<a>dual-joining</a>/<a>right-joining</a> letter after a <a>join-to-left</a> letter, which
should not be joined to its previous letter.</p>
<figure id="figure-joining_disjoining_enforcement">
<img height="140"
src="images/drawings/joining/joining-heh-zwj-zwnj-sheh.png"
alt="ZWJ+ZWNJ example.">
<figcaption>
Example of using <span class="qterm">&lt;ZWJ, ZWNJ&gt;</span> for
<a>joining-disjoining enforcement</a>.
</figcaption>
</figure>
</section>
<section id="h_context_based_joining">
<h4><a href="#h_context_based_joining">Context-Based Joining</a>
</h4>
<p>Joining Control is not only managed by the content, but is sometimes determined by the word
context. For example, a word may be broken between two joined letters because of line
break, meaning the content is not changed and only the joining form of letters should be
maintained across the break.</p>
</section>
</section>
<section id="h_joining_segments">
<h4><a href="#h_joining_segments">Joining Segments</a>
</h4>
<p>A sequence of letters that join together is called a <dfn>Joining Segment</dfn>.
Regardless of language, <dfn>joining segments</dfn> have no direct relationship to
syllables.</p>
<p>Two types of joining segments exist: <dfn data-lt=
"closed joining segment|closed joining segments">closed</dfn> and <dfn data-lt=
"open joining segment|open joining segments">open</dfn>.</p>
<section id="h_closed_joining_segments">
<h4><a href="#h_closed_joining_segments">Closed Joining Segments</a>
</h4>
<p>Joining Segments usually have a closed form, meaning that they start in a
<a>non-join-to-right</a> form and end in a <a>non-join-to-left</a> form. <a>Closed
joining segments</a> are the result of segments that either start and end with their normal
behavior (<a href="#joining_rule_1">Joining Rule 1</a>), or are closed by <a href=
"#h_disjoining_enforcement">disjoining enforcement</a> (<a href="#joining_rule_2">Joining
Rule 2</a>).</p>
<p>There are two possible types of closed segments:</p>
<ul>
<li><dfn>Single-Letter Closed Segment</dfn>, which contains only one letter that is in
its Isolated form.</li>
<li><dfn>Multi-Letter Closed Segment</dfn>, which contains more than one letter,
starting with an Initial form, zero or more Medial forms, and ending with a Final
form.</li>
</ul>
</section>
<figure id="figure-closed_joining_segment_example">
<img height="140"
src="images/drawings/joining/joining-multiple-single-letter-closed-segments.png"
alt="A word with only single-letter closed segments.">
<img height="140"
src="images/drawings/joining/joining-single-multi-letter-closed-segment.png"
alt="A word that is just one long multi-letter closed segment.">
<figcaption>
Examples of closed joining segment types.
</figcaption>
</figure>
<section id="h_open_joining_segments">
<h4><a href="#h_open_joining_segments">Open Joining Segments</a>
</h4>
<p>In certain cases, as noted in <a href="#joining_rule_3">Joining Rules 3</a>
<a href="#joining_rule_4">and 4</a>, <a>joining segments</a> can start with a
<a>join-to-right</a> form, or end with a <a>join-to-left</a> form, or both.</p>
<p>There are three possible types of these segments:</p>
<ul>
<li><dfn>Open-On-Left Segment</dfn>, which contains one or more Dual-Joining letters,
starting with an Initial form and continuing with zero or more Medial forms.</li>
<li><dfn>Open-On-Right Segment</dfn>, which starts with zero or more Medial Form
letters, and ends with a Final Form letter.</li>
<li><dfn>Open-On-Both-Sides Segment</dfn>, which contains one or more Dual-Joining
letters, all in their Medial Form.</li>
</ul>
</section>
<figure id="figure-open_joining_segment_example">
<img height="140"
src="images/drawings/joining/joining-abbreviation-closed-segments.png"
alt="An abbreviation with closed segments.">
<img height="140"
src="images/drawings/joining/joining-abbreviation-open-on-left-segments.png"
alt="An abbreviation with open-on-left segments.">
<figcaption>
Examples of joining segment types.
</figcaption>
</figure>
</section>
<section id="h_non_joining_characters">
<h4><a href="#h_non_joining_characters">Non-Joining Characters</a>
</h4>
<p>Arabic Letters, two <a>Joining Control Characters</a> (<a>ZWNJ</a> and <a>ZWJ</a>), and
<a>TATWEEL</a> are the only characters used in the Arabic writing system with joining
behavior.</p>
<p>Arabic diacritics, other Unicode <span class="qterm">non-spacing marks</span>, and most
Unicode <span class="qterm">format control characters</span> are considered <dfn data-lt=
"joining transparent">transparent</dfn> in joining behavior.</p>
<p>All other Unicode characters in Arabic script (as well as Latin and many other major
scripts) are non-joining and do not take any joining forms other than Isolated.</p>
<p>For more details on the <span class="qterm">Arabic Cursive Joining algorithm</span>,
please refer to chapter <a href=
"http://www.unicode.org/versions/Unicode9.0.0/ch09.pdf">Middle East-I — Modern and
Liturgical Scripts</a> of The Unicode Standard. [[!UNICODE]]</p>
</section>
</section>
<section id="h_ligatures">
<h3>Ligatures</h3>
<p>Almost all the writing styles of Arabic script use a special shape when letters
<span class="lettername">lam</span> and <span class="lettername">alef</span> are joined. Most
Arabic fonts include mandatory ligatures for this combination. Ignoring this ligature, as
shown in <a href="#figure-laam-alef-ligature"></a>, leads to wrong rendering of text.</p>
<figure id="figure-laam-alef-ligature">
<img src="images/laam-alef-ligature.png" alt=
"Correct and wrong ways of rendering letter lam followed by letter alef">
<figcaption>
Correct and wrong ways of rendering letter <span class="lettername">lam</span> followed
by letter <span class="lettername">alef</span>
</figcaption>
</figure>
<p>This shape is not limited to the combination of <span class="uname">U+0644 ARABIC LETTER
LAM</span> (<span dir="rtl" lang="ar">ل</span>) with <span class="uname">U+0627 ARABIC LETTER
ALEF</span> (<span dir="rtl" lang="ar">ا</span>). Variations of letter <span class=
"lettername">alef</span> such as <span class="uname">U+0622 ARABIC LETTER ALEF WITH MADDA
ABOVE</span> (<span dir="rtl" lang="ar">آ</span>) and <span class="uname">U+0623 ARABIC
LETTER ALEF WITH HAMZA ABOVE</span> (<span dir="rtl" lang="ar">أ</span>) and also variations
of letter <span class="lettername">lam</span> follow the same rules as well. Combination with
diacritics does not affect these ligatures. Each of these ligatures also provides a special
shape for joining from its right side (to the preceding letter).</p>
</section>
<section id="h_diacritics">
<h3><a href="#h_diacritics">Diacritics</a>
</h3>
<p>More than one diacritic can appear in succession after a single character, and all of them
should be applied to the same character. Font files usually define special shapes or
positioning for combinations of diacritics. This extra information should be applied when
rendering text.</p>
<p><a href="#figure-combining_diacritics"></a> shows an example, where, according to this
font’s specification, combining U+0651 ARABIC SHADDA and U+0650 ARABIC KASRA changes their
positions. Various font files may require different transformations.</p>
<figure id="figure-combining_diacritics">
<img src="images/combining-diacritics.png" alt=
"Diacritics could be combined in Arabic script." style="width:90%">
<figcaption>
Diacritics could be combined in Arabic script.
</figcaption>
</figure>
</section>
<section id="h_font_and_typographical_considerations">
<h3><a href="#h_font_and_typographical_considerations">Font and Typographical
considerations</a>
</h3>
<section id="h_arabic_style_and_calligraphy">
<h4><a href="#h_arabic_style_and_calligraphy">Arabic Style and Calligraphy</a>
</h4>
<p>Arabic styling and writing has its origins in Islamic art and civilization, and was
widely used to decorate mosques and palaces, as well as to create beautiful manuscripts and
books, and especially to copy the <em>Korʼan</em>. Arabic script is cursive, making it
viable to support the overlapping and composition of different geometric shapes. Words can be
written in a very condensed form as well as stretched into elongated shapes, and the
scribes and artists of Islam labored with passion to take advantage of all these
possibilities.</p>
<p>From the beginning of Arabic calligraphy, two tendencies or two types of styles can be
seen emerging: writing for the decoration of mosques and sculptures, which was complex and
highly decorative, and writing styles reserved for writing the <em>Korʼan</em>, which were
easier to use and more readable.</p>
<p>Writing styles then evolved under the influences of cultural diversity, leading to
regional calligraphic schools and styles (<em>Kufi</em> in Iraq, <em>Farsi</em> and
<em>Taʻlīq</em> in Persia, or <em>Diwani</em> in Turkey). Additional differences arose
depending on the purpose of writing, such as the copying and dissemination of the
<em>Korʼan</em>.</p>
<p>In general we group under the generic term <dfn>Naskh</dfn> (copy/inscription) the
scripts which are meant for reading at smaller sizes and are suitable for books and texts
to be read, e.g. the <em>Korʼan</em>, and as <dfn>Kufic</dfn> the highly stylized font
styles used for ornamentation and more styled writings. Nevertheless, the rich evolution of
the Arabic script led to the distinctive enumeration of a number of additional named
styles.</p>
<p>Similarly, two other generic terms are used to classify styles: <em>Mabsut</em> (<em>wa
Mustaqīm</em>) is a form of style that is elongated and straight angled, [which dominated
the copies of the <em>Korʼan</em> in the eighth and ninth centuries], and <em>Muqawwar</em> (<em>wa
Mudawwar</em>) is a form of style that is curved and rounded.</p>
</section>
<section id="h_different_writing_styles">
<h4><a href="#h_different_writing_styles">Different Writing Styles</a>
</h4>
<p>Basics and principles of Arabic writing were defined by <em>Ibn Moqlah</em> (886–940
CE), who defined six styles of writing: <em>Kufi</em>, <em>Thuluth</em>, <em>Naskh</em>,
<em>Ruqʻa</em>, <em>Taʻlīq</em> and <em>Diwani</em>.</p>
<dl>
<dt>Kufi (كوفي)</dt>
<dd class="flexContainer">
<figure class="floatedFigure">
<img style="width: 200px; height: 147px;" src="images/kufiExampleQuran.jpg" alt=
"Kufi script">
<figcaption>
Kufi example [<a href=
"https://commons.wikimedia.org/wiki/File:A_section_of_the_Koran_-_Google_Art_Project.jpg">Source</a>].
</figcaption>
</figure>
<p>One of the oldest and best known Arabic scripts. It is characterized by its
decorative and pronounced geometric forms, well adapted for architectural designs. The
style grew with the beginning of Islam to satisfy a need for Muslims to codify the
Korʼan.</p>
</dd>
<dt>Thuluth (ثلث)</dt>
<dd class="flexContainer">
<figure class="floatedFigure">
<img style="width: 132px; height: 173px;" src="images/basmalahThuluth.png" alt=
"Thuluth script">
<figcaption>
Thuluth example [<a href=
"https://commons.wikimedia.org/wiki/File:Basmalah-1wm.png">Source</a>].
</figcaption>
</figure>
<p>(The third.) Recognizable by the fact that the letters and words are highly
interleaved in its complex form. Perhaps the most difficult style to write (requiring a
significant amount of skill), both in terms of its letters and in terms of its
structure and composition.</p>
</dd>
<dt>Naskh (نسخ)</dt>
<dd class="flexContainer">
<figure class="floatedFigure">
<img style="width: 147px; height: 166px;" src="images/naskhQuran2.png" alt=
"Naskh script">
<figcaption>
Naskh example [<a href=
"https://commons.wikimedia.org/wiki/File:FirstSurahKoran_%28fragment%29.jpg">Source</a>].
</figcaption>
</figure>
<p>One of the clearest styles of all, with clearly distinguished letters which
facilitate reading and pronunciation. Can be written at small sizes (traditionally
using pens made of reeds and ink), which suits the production of longer texts used for
boards and books intended for the general population, especially the Korʼan.</p>
</dd>