Ruby on Rails | Screencasts | Download | Documentation | Weblog | Community | Source

root/trunk/activesupport/test/multibyte_handler_test.rb

Revision 8789, 18.0 kB (checked in by bitsweat, 7 months ago)

Ruby 1.9 compat: declare utf-8 file encoding

Line 
1 # encoding: utf-8
2 require 'abstract_unit'
3
4 if RUBY_VERSION < '1.9'
5
6 $KCODE = 'UTF8'
7
8 class String
9   # Unicode Inspect: returns the string itself, a space, and then its codepoints in hex inside square brackets
10   def ui
11     "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' ')) # unpack("U*") reads the string as UTF-8 codepoints
12   end unless ''.respond_to?(:ui) # the definition is skipped when String already responds to :ui
13 end
14
15 module UTF8HandlingTest
16  
17   def common_setup
18     # This is an ASCII string with some russian strings and a ligature. It's nicely calibrated, because
19     # slicing it at some specific bytes will kill your characters if you use standard Ruby routines.
20     # It has both capital and standard letters, so that we can test case conversions easily.
21     # It has 26 characters and 28 when the ligature gets split during normalization.
22     @string =     "Abcd Блå ffi бла бла бла бла"
23     @string_kd =  "Abcd Блå ffi бла бла бла бла"
24     @string_kc =  "Abcd Блå ffi бла бла бла бла"
25     @string_c =   "Abcd Блå ffi бла бла бла бла"
26     @string_d =   "Abcd Блå ffi бла бла бла бла"
27     @bytestring = "\270\236\010\210\245" # Not UTF-8
28    
29     # Characters from the character classes as described in UAX #29
30     @character_from_class = {
31       :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
32       :extend => 0x094D, :n => 0x64
33     }
34   end
35  
36   def test_utf8_recognition # UTF8Handler.consumes? must accept @string and reject @bytestring
37     assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string),
38       "Should recognize as a valid UTF-8 string"
39     assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8"
40   end
41  
42   def test_simple_normalization # Exercises @handler.normalize, compose and decompose for the forms :c, :d, :kc and :kd
43     # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
44     assert_equal [0x915, 0x93c].pack('U*').ui, [0x915, 0x93c].pack('U*').chars.normalize(:c).to_s.ui
45    
46     null_byte_str = "Test\0test"
47    
48     assert_equal '', @handler.normalize(''), "Empty string should not break things"
49     assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain"
50     assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain"
51     assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain"
52     assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain"
53     assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain"
54     assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain"
55    
56     comp_str = [ # the values below are DECIMAL codepoints, as the expected arrays further down confirm
57       44,  # U+002C COMMA (LATIN CAPITAL LETTER D would be 0x44)
58       307, # U+0133 LATIN SMALL LIGATURE IJ (COMBINING DOT ABOVE would be 0x307)
59       328, # U+0148 LATIN SMALL LETTER N WITH CARON (COMBINING OGONEK would be 0x328)
60       323 # U+0143 LATIN CAPITAL LETTER N WITH ACUTE (COMBINING DOT BELOW would be 0x323)
61     ].pack("U*")
62     norm_str_KC = [44,105,106,328,323].pack("U*") # ligature split into i (105) + j (106), accented letters stay composed
63     norm_str_C = [44,307,328,323].pack("U*") # identical to the input
64     norm_str_D = [44,307,110,780,78,769].pack("U*") # n (110) + U+030C (780), N (78) + U+0301 (769); ligature kept
65     norm_str_KD = [44,105,106,110,780,78,769].pack("U*") # ligature split and accented letters decomposed
66    
67     assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC"
68     assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C"
69     assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D"
70     assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD"
71    
72     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) } # invalid bytes must raise
73   end
74  
75   # Test for the Public Review Issue #29, bad explanation of composition might lead to a
76   # bad implementation: http://www.unicode.org/review/pr-29.html
77   def test_normalization_C_pri_29 # Each codepoint sequence below must come back unchanged from normalize(:c)
78     [
79       [0x0B47, 0x0300, 0x0B3E],
80       [0x1100, 0x0300, 0x1161]
81     ].map { |c| c.pack('U*') }.each do |c|
82       assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly"
83     end
84   end
85  
86   def test_casefolding
87     simple_str = "abCdef"
88     simple_str_upcase = "ABCDEF"
89     simple_str_downcase = "abcdef"
90    
91     assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things"
92     assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly"
93     assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly"
94     assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors"
95    
96     rus_str = "аБвгЎ\0f"
97     rus_str_upcase = "АБВГД\0F"
98     rus_str_downcase = "абвгЎ\0f"
99     assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte"
100     assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte"
101    
102     jap_str = "の埋め蟌み化察応はほが完成"
103     assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged"
104     assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged"
105    
106     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) }
107   end
108  
109   def test_capitalize # Maps input => expected: only the first character is upcased, every other character is downcased
110     { 'аБвг аБвг' => 'Абвг абвг',
111       'аБвг АБВГ' => 'Абвг абвг',
112       'АБВГ АБВГ' => 'Абвг абвг',
113       '' => '' }.each do |f,t|
114         assert_equal t, @handler.capitalize(f), "Capitalize should work as expected"
115     end
116     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) } # invalid bytes must raise
117   end
118  
119   def test_translate_offset
120     str = "БлaÃ¥" # [2, 2, 1, 2] bytes
121     assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0"
122     assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte"
123     assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte"
124     assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte"
125     assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte"
126     assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte"
127     assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte"
128     assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte"
129     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) }
130   end
131  
132   def test_insert
133     assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things"
134     assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"),
135       "Text should be inserted at right codepoints"
136     assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @string, "Insert should be destructive"
137     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do
138       @handler.insert(@bytestring, 2, "\210")
139     end
140   end
141  
142   def test_reverse # reverse must reverse by character, not by byte; compared through String#ui to show codepoints on failure
143     str = "wБлåa \n"
144     rev = "\n aåлБw"
145     assert_equal '', @handler.reverse(''), "Empty string shouldn't change"
146     assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly"
147     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) } # invalid bytes must raise
148   end
149  
150   def test_size # size and length must both report the 26 characters of @string rather than its byte count
151     assert_equal 0, @handler.size(''), "Empty string has size 0"
152     assert_equal 26, @handler.size(@string), "String length should be 26"
153     assert_equal 26, @handler.length(@string), "String length method should be properly aliased"
154     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) } # invalid bytes must raise
155   end
156  
157   def test_slice
158     assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints"
159     assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints"
160     assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil"
161     assert_equal '', @handler.slice('', 0..10), "Empty string should not break things"
162     assert_equal "d Блå ffi", @handler.slice(@string, 3..9), "Unicode characters have to be returned"
163     assert_equal "d Блå ffi", @handler.slice(@string, 3, 7), "Unicode characters have to be returned"
164     assert_equal "A", @handler.slice(@string, 0, 1), "Slicing from an offset should return characters"
165     assert_equal " Блå ffi ", @handler.slice(@string, 4..10), "Unicode characters have to be returned"
166     assert_equal "ffi бла", @handler.slice(@string, /ffi бла/u), "Slicing on Regexps should be supported"
167     assert_equal "ffi бла", @handler.slice(@string, /ffi \w\wа/u), "Slicing on Regexps should be supported"
168     assert_equal nil, @handler.slice(@string, /unknown/u), "Slicing on Regexps with no match should return nil"
169     assert_equal "ffi бла", @handler.slice(@string, /(ffi бла)/u,1), "Slicing on Regexps with a match group should be supported"
170     assert_equal nil, @handler.slice(@string, /(ffi)/u,2), "Slicing with a Regexp and asking for an invalid match group should return nil"
171     assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string"
172     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) }
173     assert_raise(TypeError, "With 2 args, should raise TypeError for non-Numeric or Regexp first argument") { @handler.slice(@string, 2..3, 1) }
174     assert_raise(TypeError, "With 2 args, should raise TypeError for non-Numeric or Regexp second argument") { @handler.slice(@string, 1, 2..3) }
175     assert_raise(ArgumentError, "Should raise ArgumentError when there are more than 2 args") { @handler.slice(@string, 1, 1, 1) }
176   end
177  
178   def test_grapheme_cluster_length # g_length must count grapheme clusters; string_from_classes is defined outside this excerpt
179     assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters"
180     assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters"
181     assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF" # names presumably key into @character_from_class — confirm
182     assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L"
183     assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V"
184     assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV"
185     assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT"
186     assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V"
187     assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T"
188     assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V"
189     assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T"
190     assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T"
191     assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T"
192     assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend"
193     assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters"
194     assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF" # clusters: n, CR+LF, n
195     assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T" # clusters: n, L+V+T
196     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) } # invalid bytes must raise
197   end
198  
199   def test_index # index must return character positions, not byte positions
200      s = "ΚαληΌέρα κόσΌε!" # NOTE(review): literal looks mis-encoded (capital Ό mid-word); the assertions only use its first two characters
201      assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string"
202      assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string"
203      assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0"
204      assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1"
205      
206      assert_equal nil, @handler.index(@bytestring, 'a') # no match in the invalid string: nil, no exception
207      assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") } # "\010" occurs in @bytestring: a match raises
208   end
209  
210   def test_indexed_insert # Exercises @handler[str, ...] = value; every step mutates s, so the statements depend on their order
211     s = "Καλη!"
212     @handler[s, 2] = "a" # single character index
213     assert_equal "Καaη!", s
214     @handler[s, 2] = "ηη" # the replacement may be longer than the character it replaces
215     assert_equal "Καηηη!", s
216     assert_raises(IndexError) { @handler[s, 10] = 'a' } # index out of bounds
217     assert_equal "Καηηη!", s # a failed assignment leaves s untouched
218     @handler[s, 2] = 32 # an Integer value: the expected string below has a space (codepoint 32) at index 2
219     assert_equal "Κα ηη!", s
220     @handler[s, 3, 2] = "λλλ" # start index + length
221     assert_equal "Κα λλλ!", s
222     @handler[s, 1, 0] = "λ" # zero length: pure insertion before index 1
223     assert_equal "Κλα λλλ!", s
224     assert_raises(IndexError) { @handler[s, 10, 4] = 'a' }
225     assert_equal "Κλα λλλ!", s
226     @handler[s, 4..6] = "ηη" # range of character indexes
227     assert_equal "Κλα ηη!", s
228     assert_raises(RangeError) { @handler[s, 10..12] = 'a' } # out-of-bounds range raises RangeError rather than IndexError
229     assert_equal "Κλα ηη!", s
230     @handler[s, /ηη/] = "λλλ" # Regexp: replaces the match
231     assert_equal "Κλα λλλ!", s
232     assert_raises(IndexError) { @handler[s, /ii/] = 'a' } # Regexp without a match
233     assert_equal "Κλα λλλ!", s
234     @handler[s, /(λλ)(.)/, 2] = "α" # Regexp + group number: replaces only that group
235     assert_equal "Κλα λλα!", s
236     assert_raises(IndexError) { @handler[s, /()/, 10] = 'a' } # non-existent group
237     assert_equal "Κλα λλα!", s
238     @handler[s, "α"] = "η" # String: replaces the first occurrence only
239     assert_equal "Κλη λλα!", s
240     @handler[s, "λλ"] = "ααα"
241     assert_equal "Κλη αααα!", s
242   end
243  
244   def test_rjust # rjust must pad on the left up to a width counted in characters
245     s = "Καη"
246     assert_raises(ArgumentError) { @handler.rjust(s, 10, '') } # empty padding string
247     assert_raises(ArgumentError) { @handler.rjust(s) } # missing width
248     assert_equal "Καη", @handler.rjust(s, -3) # widths not larger than the string return it unchanged
249     assert_equal "Καη", @handler.rjust(s, 0)
250     assert_equal "Καη", @handler.rjust(s, 3)
251     assert_equal "  Καη", @handler.rjust(s, 5)
252     assert_equal "    Καη", @handler.rjust(s, 7)
253     assert_equal "----Καη", @handler.rjust(s, 7, '-')
254     assert_equal "ααααΚαη", @handler.rjust(s, 7, 'α')
255     assert_equal "abaΚαη", @handler.rjust(s, 6, 'ab') # multi-character padding is repeated and cut to fit
256     assert_equal "αηαΚαη", @handler.rjust(s, 6, 'αη')
257   end
258  
259   def test_ljust # ljust must pad on the right up to a width counted in characters
260     s = "Καη"
261     assert_raises(ArgumentError) { @handler.ljust(s, 10, '') } # empty padding string
262     assert_raises(ArgumentError) { @handler.ljust(s) } # missing width
263     assert_equal "Καη", @handler.ljust(s, -3) # widths not larger than the string return it unchanged
264     assert_equal "Καη", @handler.ljust(s, 0)
265     assert_equal "Καη", @handler.ljust(s, 3)
266     assert_equal "Καη  ", @handler.ljust(s, 5)
267     assert_equal "Καη    ", @handler.ljust(s, 7)
268     assert_equal "Καη----", @handler.ljust(s, 7, '-')
269     assert_equal "Καηαααα", @handler.ljust(s, 7, 'α')
270     assert_equal "Καηaba", @handler.ljust(s, 6, 'ab') # multi-character padding is repeated and cut to fit
271     assert_equal "Καηαηα", @handler.ljust(s, 6, 'αη')
272   end
273  
274   def test_center # center must pad both sides up to a width counted in characters
275     s = "Καη"
276     assert_raises(ArgumentError) { @handler.center(s, 10, '') } # empty padding string
277     assert_raises(ArgumentError) { @handler.center(s) } # missing width
278     assert_equal "Καη", @handler.center(s, -3) # widths not larger than the string return it unchanged
279     assert_equal "Καη", @handler.center(s, 0)
280     assert_equal "Καη", @handler.center(s, 3)
281     assert_equal "Καη ", @handler.center(s, 4) # an odd amount of padding puts the extra character on the right
282     assert_equal " Καη ", @handler.center(s, 5)
283     assert_equal " Καη  ", @handler.center(s, 6)
284     assert_equal "--Καη--", @handler.center(s, 7, '-')
285     assert_equal "--Καη---", @handler.center(s, 8, '-')
286     assert_equal "ααΚαηαα", @handler.center(s, 7, 'α')
287     assert_equal "ααΚαηααα", @handler.center(s, 8, 'α')
288     assert_equal "aΚαηab", @handler.center(s, 6, 'ab') # each side restarts the padding string from its first character
289     assert_equal "abΚαηab", @handler.center(s, 7, 'ab')
290     assert_equal "ababΚαηabab", @handler.center(s, 11, 'ab')
291     assert_equal "αΚαηαη", @handler.center(s, 6, 'αη')
292     assert_equal "αηΚαηαη", @handler.center(s, 7, 'αη')
293   end
294  
295   def test_strip # lstrip, rstrip and strip must remove Unicode whitespace, not only ASCII whitespace
296     # A unicode aware version of strip should strip all 26 types of whitespace. This includes the ZERO WIDTH
297     # NO-BREAK SPACE (U+FEFF) aka BOM (byte order mark). The byte order mark has no place in UTF-8 because it's used to detect LE and BE.
298     b = "\n" + [ # leading whitespace
299       32, # SPACE
300       8195, # EM SPACE
301       8199, # FIGURE SPACE,
302       8201, # THIN SPACE
303       8202, # HAIR SPACE
304       65279, # ZERO WIDTH NO-BREAK SPACE (U+FEFF)
305     ].pack('U*')
306     m = "word блОМ\n\n\n  word" # middle part; its inner whitespace must survive. NOTE(review): the Cyrillic word looks mis-encoded
307     e = [ # trailing whitespace
308     65279, # ZERO WIDTH NO-BREAK SPACE (U+FEFF)
309     8201, # THIN SPACE
310     8199, # FIGURE SPACE,     
311     32, # SPACE
312     ].pack('U*')
313     string = b+m+e
314    
315     assert_equal '', @handler.strip(''), "Empty string should stay empty"
316     assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left"
317     assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right"
318     assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides"
319    
320     bs = "\n   #{@bytestring} \n\n"
321     assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip" # unlike most handler methods, strip must not raise here
322   end
323  
324   def test_tidy_bytes
325     result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
326     assert_equal result, @handler.tidy_bytes(@bytestring)
327     assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a'),
328       'tidy_bytes should leave surrounding characters intact'
329     assert_equal "é#{result}é", @handler.tidy_bytes('é' + @bytestring + 'é'),
330       'tidy_bytes should leave surrounding characters intact'
331     assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
332    
333     assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
334     assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol
335     assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
336     assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
337     assert_equal "\x00", @handler.tidy_bytes("\x00") # null char