Ticket #5396: updated_multibyte_activesupport.diff
| File updated_multibyte_activesupport.diff, 59.0 kB (added by manfred, 4 years ago) |
|---|
-
activesupport/test/multibyte_conformance.rb
require File.dirname(__FILE__) + '/abstract_unit'
require 'open-uri'

$KCODE = 'UTF8'
UNIDATA_VERSION = "5.0.0"
UNIDATA_URL = "http://www.unicode.org/Public/#{UNIDATA_VERSION}/ucd"
UNIDATA_FILE = '/NormalizationTest.txt'
CACHE_DIR = File.dirname(__FILE__) + '/cache'

# Fetches remote files into a local path, skipping files that are already present.
class Downloader
  # Copies +from+ (anything open-uri can read) to the local path +to+ unless +to+ already exists.
  #
  # The data is written to a temporary "<to>.part" file first and renamed once the transfer has finished,
  # so an interrupted download is never mistaken for a complete cached file on the next run.
  def self.download(from, to)
    return if File.exist?(to)

    $stderr.puts "Downloading #{from} to #{to}"
    partial = "#{to}.part"
    begin
      open(from) do |source|
        File.open(partial, 'w') do |target|
          source.each_line { |l| target.write l }
        end
      end
      File.rename(partial, to)
    ensure
      # Only still present when the transfer or the rename failed
      File.delete(partial) if File.exist?(partial)
    end
  end
end

class String
  # Unicode Inspect returns the codepoints of the string in hex
  def ui
    "#{self} " + ("[%s]" % unpack("U*").map { |cp| cp.to_s(16) }.join(' '))
  end unless ''.respond_to?(:ui)
end

Dir.mkdir(CACHE_DIR) unless File.exist?(CACHE_DIR)
Downloader.download(UNIDATA_URL + UNIDATA_FILE, CACHE_DIR + UNIDATA_FILE)

# Conformance tests driven by the Unicode NormalizationTest.txt data file. Including classes have to set
# @handler to an object that responds to normalize(string, form).
module ConformanceTest
  def test_normalizations_C
    each_line_of_norm_tests do |*cols|
      col1, col2, col3, col4, col5, comment = *cols

      # CONFORMANCE:
      # 1. The following invariants must be true for all conformant implementations
      #
      #    NFC
      #      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
      assert_equal col2.ui, @handler.normalize(col1, :c).ui, "Form C - Col 2 has to be NFC(1) - #{comment}"
      assert_equal col2.ui, @handler.normalize(col2, :c).ui, "Form C - Col 2 has to be NFC(2) - #{comment}"
      assert_equal col2.ui, @handler.normalize(col3, :c).ui, "Form C - Col 2 has to be NFC(3) - #{comment}"
      #
      #      c4 ==  NFC(c4) ==  NFC(c5)
      assert_equal col4.ui, @handler.normalize(col4, :c).ui, "Form C - Col 4 has to be C(4) - #{comment}"
      assert_equal col4.ui, @handler.normalize(col5, :c).ui, "Form C - Col 4 has to be C(5) - #{comment}"
    end
  end

  def test_normalizations_D
    each_line_of_norm_tests do |*cols|
      col1, col2, col3, col4, col5, comment = *cols
      #
      #    NFD
      #      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
      assert_equal col3.ui, @handler.normalize(col1, :d).ui, "Form D - Col 3 has to be NFD(1) - #{comment}"
      assert_equal col3.ui, @handler.normalize(col2, :d).ui, "Form D - Col 3 has to be NFD(2) - #{comment}"
      assert_equal col3.ui, @handler.normalize(col3, :d).ui, "Form D - Col 3 has to be NFD(3) - #{comment}"
      #      c5 ==  NFD(c4) ==  NFD(c5)
      assert_equal col5.ui, @handler.normalize(col4, :d).ui, "Form D - Col 5 has to be NFD(4) - #{comment}"
      assert_equal col5.ui, @handler.normalize(col5, :d).ui, "Form D - Col 5 has to be NFD(5) - #{comment}"
    end
  end

  def test_normalizations_KC
    each_line_of_norm_tests do |*cols|
      col1, col2, col3, col4, col5, comment = *cols
      #
      #    NFKC
      #      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
      [col1, col2, col3, col4, col5].each_with_index do |col, i|
        assert_equal col4.ui, @handler.normalize(col, :kc).ui,
          "Form KC - Col 4 has to be NFKC(#{i + 1}) - #{comment}"
      end
    end
  end

  def test_normalizations_KD
    each_line_of_norm_tests do |*cols|
      col1, col2, col3, col4, col5, comment = *cols
      #
      #    NFKD
      #      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
      [col1, col2, col3, col4, col5].each_with_index do |col, i|
        assert_equal col5.ui, @handler.normalize(col, :kd).ui,
          "Form KD - Col 5 has to be NFKD(#{i + 1}) - #{comment}"
      end
    end
  end

  protected

  # Yields the five test columns (converted from hex codepoints to UTF-8 strings) plus the trailing comment
  # for every data line of the cached NormalizationTest.txt. Comment lines, blank lines and lines that do not
  # have exactly five columns (such as the @Part headers) are skipped.
  def each_line_of_norm_tests(&block)
    lines = 0
    max_test_lines = 0 # Don't limit below 38, because that's the header of the testfile
    File.open(CACHE_DIR + UNIDATA_FILE, 'r') do |f|
      until f.eof? || (max_test_lines > 38 && lines > max_test_lines)
        lines += 1
        # chomp (not chomp!): the bang version returns nil when the line has no trailing newline
        line = f.gets.chomp
        next if line.empty? || line =~ /^\#/

        # Split only on the first '#', so a comment containing '#' is kept whole
        cols, comment = line.split("#", 2)
        cols = cols.split(";").map { |e| e.strip }.reject { |e| e.empty? }
        next unless cols.length == 5

        # codepoints are in hex in the test suite, pack wants them as integers
        cols.map! { |c| c.split.map { |codepoint| codepoint.to_i(16) }.pack("U*") }
        cols << comment

        yield(*cols)
      end
    end
  end
end

begin
  require_library_or_gem('utf8proc_native')
  # The handler is namespaced under ActiveSupport; requiring it from active_record/ raised a LoadError that
  # the rescue below swallowed, so this test case was silently never defined.
  # NOTE(review): assumes utf8_handler_proc.rb sits next to utf8_handler.rb - confirm.
  require 'active_support/multibyte/handlers/utf8_handler_proc'
  class ConformanceTestProc < Test::Unit::TestCase
    include ConformanceTest
    def setup
      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
    end
  end
rescue LoadError
end

class ConformanceTestPure < Test::Unit::TestCase
  include ConformanceTest
  def setup
    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
  end
end
activesupport/test/multibyte_chars_test.rb
require File.dirname(__FILE__) + '/abstract_unit'

$KCODE = 'UTF8'

# Tests for the String#chars proxy (ActiveSupport::Multibyte::Chars).
#
# NOTE(review): the non-ASCII literals in this file were mis-encoded (UTF-8 bytes rendered through a
# single-byte code page, with undisplayable bytes dropped) and have been reconstructed. Confirm them against
# the original attachment.
class CharsTest < Test::Unit::TestCase

  def setup
    @s = {
      # NOTE(review): reconstructed so that it satisfies the assertions below (26 bytes, 17 characters, the
      # ligature at byte 12 / character 9). The doubled space and the final CJK character are assumptions.
      :utf8 => "Abcd Блå ﬃ блa  埋",
      :ascii => "asci ias c iia s",
      :bytes => "\270\236\010\210\245"
    }
  end

  def test_sanity
    @s.each do |t, s|
      assert s.respond_to?(:chars), "All string should have the chars method (#{t})"
      assert s.respond_to?(:to_s), "All string should have the to_s method (#{t})"
      assert_kind_of ActiveSupport::Multibyte::Chars, s.chars, "#chars should return an instance of Chars (#{t})"
    end
  end

  def test_comparability
    @s.each do |t, s|
      assert_equal s, s.chars.to_s, "Chars#to_s should return enclosed string unchanged"
    end
    assert_nothing_raised do
      assert_equal "a", "a", "Normal string comparisons should be unaffected"
      assert_not_equal "a", "b", "Normal string comparisons should be unaffected"
      assert_not_equal "a".chars, "b".chars, "Chars objects should be comparable"
      assert_equal "a".chars, "A".downcase.chars, "Chars objects should be comparable to each other"
      assert_equal "a".chars, "A".downcase, "Chars objects should be comparable to strings coming from elsewhere"
    end

    assert !@s[:utf8].eql?(@s[:utf8].chars), "Strict comparison is not supported"
    assert_equal @s[:utf8], @s[:utf8].chars, "Chars should be compared by their enclosed string"

    other_string = @s[:utf8].dup
    assert_equal other_string, @s[:utf8].chars, "Chars should be compared by their enclosed string"
    assert_equal other_string.chars, @s[:utf8].chars, "Chars should be compared by their enclosed string"

    strings = ['builder'.chars, 'armor'.chars, 'zebra'.chars]
    strings.sort!
    assert_equal ['armor', 'builder', 'zebra'], strings, "Chars should be sortable based on their enclosed string"

    # This leads to a StackLevelTooDeep exception if the comparison is not wired properly
    assert_raise(NameError) do
      Chars
    end
  end

  def test_utf8?
    assert @s[:utf8].is_utf8?, "UTF-8 strings are UTF-8"
    assert @s[:ascii].is_utf8?, "All ASCII strings are also valid UTF-8"
    assert !@s[:bytes].is_utf8?, "This bytestring isn't UTF-8"
  end

  # The test for the following methods are defined here because they can only be defined on the Chars class for
  # various reasons

  def test_gsub
    assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x')
    with_kcode('none') do
      assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x')
    end
  end

  def test_split
    word = "eﬃcient"
    chars = ["e", "ﬃ", "c", "i", "e", "n", "t"]
    assert_equal chars, word.split(//)
    assert_equal chars, word.chars.split(//)
    assert_kind_of ActiveSupport::Multibyte::Chars, word.chars.split(//).first, "Split should return Chars instances"
  end

  def test_regexp
    with_kcode('none') do
      assert_equal 12, (@s[:utf8].chars =~ /ﬃ/),
        "Regex matching should be bypassed to String"
    end
    with_kcode('UTF8') do
      assert_equal 9, (@s[:utf8].chars =~ /ﬃ/),
        "Regex matching should be unicode aware"
    end
  end

  def test_pragma
    with_kcode('UTF8') do
      assert " ".chars.send(:utf8_pragma?), "UTF8 pragma should be on because KCODE is UTF8"
    end
    with_kcode('none') do
      assert !" ".chars.send(:utf8_pragma?), "UTF8 pragma should be off"
    end
  end

  def test_handler_setting
    handler = ''.chars.handler

    ActiveSupport::Multibyte::Chars.handler = :first
    assert_equal :first, ''.chars.handler
    ActiveSupport::Multibyte::Chars.handler = :second
    assert_equal :second, ''.chars.handler
    assert_raise(NoMethodError) do
      ''.chars.handler.split
    end

    # Restore the real handler so the other tests are not affected
    ActiveSupport::Multibyte::Chars.handler = handler
  end

  def test_method_chaining
    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase
    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.strip, "Strip should return a Chars object"
    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase.strip, "The Chars object should be " +
      "forwarded down the call path for chaining"
    assert_equal 'foo', " FOO ".chars.normalize.downcase.strip, "The Chars that results from the " +
      " operations should be comparable to the string value of the result"
  end

  def test_passthrough_on_kcode
    # The easiest way to check if the passthrough is in place is through #size
    # ('none' instead of the original 'nonce', to match the other with_kcode calls in this file)
    with_kcode('none') do
      assert_equal 26, @s[:utf8].chars.size
    end
    with_kcode('UTF8') do
      assert_equal 17, @s[:utf8].chars.size
    end
  end

  def test_destructiveness
    # Note that we're testing the destructiveness here and not the correct behaviour of the methods
    str = 'ac'
    str.chars.insert(1, 'b')
    assert_equal 'abc', str, 'Insert should be destructive for a string'

    str = 'ac'
    str.chars.reverse!
    assert_equal 'ca', str, 'reverse! should be destructive for a string'
  end

  def test_resilience
    assert_nothing_raised do
      assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string"
    end
    assert_nothing_raised do
      assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string"
    end
    assert_nothing_raised do
      @s[:bytes].chars.reverse!
      assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string"
    end
  end

  protected

  # Runs the block with $KCODE temporarily set to +kcode+ and restores the previous value afterwards,
  # even when the block raises.
  def with_kcode(kcode)
    old_kcode, $KCODE = $KCODE, kcode
    begin
      yield
    ensure
      $KCODE = old_kcode
    end
  end
end
activesupport/test/multibyte_handler_test.rb
require File.dirname(__FILE__) + '/abstract_unit'

$KCODE = 'UTF8'

class String
  # Unicode Inspect returns the codepoints of the string in hex
  def ui
    "#{self} " + ("[%s]" % unpack("U*").map { |cp| cp.to_s(16) }.join(' '))
  end unless ''.respond_to?(:ui)
end

# Shared handler tests. Including classes call common_setup and set @handler in their setup method.
#
# NOTE(review): the non-ASCII literals in this file were mis-encoded (UTF-8 bytes rendered through a
# single-byte code page, with undisplayable bytes dropped) and have been reconstructed from the surviving
# bytes and the assertions that use them. Confirm them against the original attachment.
module UTF8HandlingTest

  def common_setup
    # This is an ASCII string with some russian strings and a ligature. It's nicely calibrated, because
    # slicing it at some specific bytes will kill your characters if you use standard Ruby routines.
    # It has both capital and standard letters, so that we can test case conversions easily.
    # It has 26 characters and 28 when the ligature gets split during normalization.
    @string = "Abcd Блå ﬃ бла бла бла бла"
    # "a\314\212" is 'a' followed by U+030A COMBINING RING ABOVE, i.e. the decomposed form of 'å'
    @string_kd = "Abcd Блa\314\212 ffi бла бла бла бла"
    @string_kc = "Abcd Блå ffi бла бла бла бла"
    @string_c = "Abcd Блå ﬃ бла бла бла бла"
    @string_d = "Abcd Блa\314\212 ﬃ бла бла бла бла"
    @bytestring = "\270\236\010\210\245" # Not UTF-8

    # Characters from the character classes as described in UAX #29
    @character_from_class = {
      :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
      :extend => 0x094D, :n => 0x64
    }
  end

  def test_utf8_recognition
    assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string),
      "Should recognize as a valid UTF-8 string"
    assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8"
  end

  def test_simple_normalization
    null_byte_str = "Test\0test"

    assert_equal '', @handler.normalize(''), "Empty string should not break things"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain"

    # The codepoints are decimal. The expected decompositions below (105,106 = "ij"; 110,780 = n + caron;
    # 78,769 = N + acute) identify which characters they are.
    comp_str = [
      44,  # COMMA (U+002C)
      307, # LATIN SMALL LIGATURE IJ (U+0133)
      328, # LATIN SMALL LETTER N WITH CARON (U+0148)
      323  # LATIN CAPITAL LETTER N WITH ACUTE (U+0143)
    ].pack("U*")
    norm_str_KC = [44,105,106,328,323].pack("U*")
    norm_str_C = [44,307,328,323].pack("U*")
    norm_str_D = [44,307,110,780,78,769].pack("U*")
    norm_str_KD = [44,105,106,110,780,78,769].pack("U*")

    assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC"
    assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C"
    assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D"
    assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD"

    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) }
  end

  # Test for the Public Review Issue #29, bad explanation of composition might lead to a
  # bad implementation: http://www.unicode.org/review/pr-29.html
  def test_normalization_C_pri_29
    [
      [0x0B47, 0x0300, 0x0B3E],
      [0x1100, 0x0300, 0x1161]
    ].map { |c| c.pack('U*') }.each do |c|
      assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly"
    end
  end

  def test_casefolding
    simple_str = "abCdef"
    simple_str_upcase = "ABCDEF"
    simple_str_downcase = "abcdef"

    assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things"
    assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly"
    assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly"
    assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors"

    rus_str = "аБвгд\0f"
    rus_str_upcase = "АБВГД\0F"
    rus_str_downcase = "абвгд\0f"
    assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte"
    assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte"

    jap_str = "の埋め込み化対応はほぼ完成"
    assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged"
    assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged"

    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) }
  end

  def test_capitalize
    { 'аБвг аБвг' => 'Абвг абвг',
      'аБвг АБВГ' => 'Абвг абвг',
      'АБВГ АБВГ' => 'Абвг абвг',
      '' => '' }.each do |f,t|
      assert_equal t, @handler.capitalize(f), "Capitalize should work as expected"
    end
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) }
  end

  def test_translate_offset
    str = "Блaå" # [2, 2, 1, 2] bytes
    assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0"
    assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte"
    assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte"
    assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte"
    assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte"
    assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte"
    assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte"
    assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) }
  end

  def test_insert
    assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things"
    assert_equal "Abcd Блå ﬃБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"),
      "Text should be inserted at right codepoints"
    assert_equal "Abcd Блå ﬃБУМ бла бла бла бла", @string, "Insert should be destructive"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do
      @handler.insert(@bytestring, 2, "\210")
    end
  end

  def test_reverse
    str = "wБлåa \n"
    rev = "\n aåлБw"
    assert_equal '', @handler.reverse(''), "Empty string shouldn't change"
    assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) }
  end

  def test_size
    assert_equal 0, @handler.size(''), "Empty string has size 0"
    assert_equal 26, @handler.size(@string), "String length should be 26"
    assert_equal 26, @handler.length(@string), "String length method should be properly aliased"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) }
  end

  def test_slice
    assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints"
    assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints"
    assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil"
    assert_equal '', @handler.slice('', 0..10), "Empty string should not break things"
    assert_equal "d Блå ﬃ", @handler.slice(@string, 3..9), "Unicode characters have to be returned"
    assert_equal " Блå ﬃ ", @handler.slice(@string, 4..10), "Unicode characters have to be returned"
    assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) }
  end

  def test_grapheme_cluster_length
    assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters"
    assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters"
    assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend"
    assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters"
    assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF"
    assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) }
  end

  def test_index
    s = "Καλημέρα κόσμε!"
    assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string"
    assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string"
    assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0"
    assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1"

    assert_equal nil, @handler.index(@bytestring, 'a')
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") }
  end

  def test_strip
    # A unicode aware version of strip should strip all 26 types of whitespace. This includes the NO BREAK SPACE
    # aka BOM (byte order mark). The byte order mark has no place in UTF-8 because it's used to detect LE and BE.
    b = "\n" + [
      32, # SPACE
      8195, # EM SPACE
      8199, # FIGURE SPACE,
      8201, # THIN SPACE
      8202, # HAIR SPACE
      65279, # NO BREAK SPACE (ZW)
    ].pack('U*')
    m = "word блин\n\n\n  word"
    e = [
      65279, # NO BREAK SPACE (ZW)
      8201, # THIN SPACE
      8199, # FIGURE SPACE,
      32, # SPACE
    ].pack('U*')
    string = b+m+e

    assert_equal '', @handler.strip(''), "Empty string should stay empty"
    assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left"
    assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right"
    assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides"

    bs = "\n   #{@bytestring} \n\n"
    assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip"
  end

  def test_tidy_bytes
    assert_equal "\010", @handler.tidy_bytes(@bytestring)
    assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a')
    assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
  end

  protected

  # Builds a string with one character from each of the named UAX #29 classes (see common_setup)
  def string_from_classes(classes)
    classes.collect do |k|
      @character_from_class[k.intern]
    end.pack('U*')
  end
end


begin
  require_library_or_gem('utf8proc_native')
  # The handler is namespaced under ActiveSupport; requiring it from active_record/ raised a LoadError that
  # the rescue below swallowed, so this test case was silently never defined.
  # NOTE(review): assumes utf8_handler_proc.rb sits next to utf8_handler.rb - confirm.
  require 'active_support/multibyte/handlers/utf8_handler_proc'
  class UTF8HandlingTestProc < Test::Unit::TestCase
    include UTF8HandlingTest
    def setup
      common_setup
      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
    end
  end
rescue LoadError
end

class UTF8HandlingTestPure < Test::Unit::TestCase
  include UTF8HandlingTest
  def setup
    common_setup
    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
  end
end
activesupport/Rakefile
old new 21 21 t.warning = true 22 22 } 23 23 24 desc 'Runs the conformance tests for unicode operations' 25 task :test_conformance do 26 `ruby test/multibyte_conformance.rb` 27 end 28 24 29 # Create compressed packages 25 30 dist_dirs = [ "lib", "test"] 26 31 … … 29 34 Rake::RDocTask.new { |rdoc| 30 35 rdoc.rdoc_dir = 'doc' 31 36 rdoc.title = "Active Support -- Utility classes and standard library extensions from Rails" 32 rdoc.options << '--line-numbers' << '--inline-source' 37 rdoc.options << '--line-numbers' << '--inline-source' << '--charset=utf-8' 33 38 rdoc.template = "#{ENV['template']}.rb" if ENV['template'] 34 39 rdoc.rdoc_files.include('README', 'CHANGELOG') 35 40 rdoc.rdoc_files.include('lib/active_support.rb') -
activesupport/lib/active_support/multibyte.rb
# Settings shared by the multibyte handlers.
module ActiveSupport::Multibyte
  # Normalization form used when a caller does not ask for a specific one
  DEFAULT_NORMALIZATION_FORM = :kc

  # Forms accepted by the handlers' normalize method
  NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
end

require 'active_support/multibyte/chars'
activesupport/lib/active_support/core_ext/string.rb
old new 3 3 require File.dirname(__FILE__) + '/string/access' 4 4 require File.dirname(__FILE__) + '/string/starts_ends_with' 5 5 require File.dirname(__FILE__) + '/string/iterators' 6 require File.dirname(__FILE__) + '/string/unicode' 6 7 7 8 class String #:nodoc: 8 9 include ActiveSupport::CoreExtensions::String::Access … … 10 11 include ActiveSupport::CoreExtensions::String::Inflections 11 12 include ActiveSupport::CoreExtensions::String::StartsEndsWith 12 13 include ActiveSupport::CoreExtensions::String::Iterators 14 include ActiveSupport::CoreExtensions::String::Unicode 13 15 end -
activesupport/lib/active_support/core_ext/string/unicode.rb
module ActiveSupport #:nodoc:
  module CoreExtensions #:nodoc:
    module String #:nodoc:
      # Define methods for handling unicode data.
      module Unicode
        # +chars+ is a Unicode safe proxy for string methods. It creates and returns an instance of the
        # ActiveSupport::Multibyte::Chars class which encapsulates the original string. A Unicode safe version of all
        # the String methods are defined on this proxy class. Undefined methods are forwarded to String, so all of the
        # string overrides can also be called through the +chars+ proxy.
        #
        #   name = 'Claus Müller'
        #   name.reverse #=> "rell??M sualC"
        #   name.length #=> 13
        #
        #   name.chars.reverse.to_s #=> "rellüM sualC"
        #   name.chars.length #=> 12
        #
        #
        # All the methods on the chars proxy which normally return a string will return a Chars object. This allows
        # method chaining on the result of any of these methods.
        #
        #   name.chars.reverse.length #=> 12
        #
        # The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing
        # between String and Chars work like expected. The bang! methods change the internal string representation
        # in the Chars object. Interoperability problems can be resolved easily with a +to_s+ call.
        #
        # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars and
        # ActiveSupport::Multibyte::Handlers::UTF8Handler
        def chars
          ActiveSupport::Multibyte::Chars.new(self)
        end

        # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
        # them), returns false otherwise.
        def is_utf8?
          ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(self)
        end
      end
    end
  end
end
activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
old new 1 # Contains all the handlers and helper classes 2 module ActiveSupport::Multibyte::Handlers 3 class EncodingError < ArgumentError; end 4 5 class Codepoint #:nodoc: 6 attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping 7 end 8 9 class UnicodeDatabase #:nodoc: 10 attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary 11 12 # Creates a new UnicodeDatabase instance and loads the database. 13 def initialize 14 begin 15 @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load 16 rescue Exception => e 17 $stderr.write "Couldn't load the unicode tables for UTF8Handler (#{e.to_s}), handler is unusable\n" 18 end 19 @codepoints ||= Hash.new(Codepoint.new) 20 @composition_exclusion ||= [] 21 @composition_map ||= {} 22 @boundary ||= {} 23 24 # Redefine the === method so we can write shorter rules for grapheme cluster breaks 25 @boundary.each do |k,_| 26 @boundary[k].instance_eval do 27 def ===(other) 28 detect { |i| i === other } ? true : false 29 end 30 end if @boundary[k].kind_of?(Array) 31 end 32 end 33 34 # Shortcut to ucd.codepoints[] 35 def [](index); @codepoints[index]; end 36 37 # Returns the filename of the unicode database 38 def self.filename 39 File.expand_path File.dirname(__FILE__) + '/../../values/unicode_tables.dat' 40 end 41 42 # Loads the unicode database and returns all the internal objects of UnicodeDatabase 43 def self.load; File.open(self.filename) { |f| Marshal.load f.read }; end 44 end 45 46 # UTF8Handler implements Unicode aware operations for strings, these operations will be used by the Chars 47 # proxy when $KCODE is set to 'UTF8'. 
48 class UTF8Handler 49 # UniCode Database 50 UCD = UnicodeDatabase.new 51 52 # Hangul character boundaries and properties 53 HANGUL_SBASE = 0xAC00 54 HANGUL_LBASE = 0x1100 55 HANGUL_VBASE = 0x1161 56 HANGUL_TBASE = 0x11A7 57 HANGUL_LCOUNT = 19 58 HANGUL_VCOUNT = 21 59 HANGUL_TCOUNT = 28 60 HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT 61 HANGUL_SCOUNT = 11172 62 HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT 63 HANGUL_JAMO_FIRST = 0x1100 64 HANGUL_JAMO_LAST = 0x11FF 65 66 # All the unicode whitespace 67 UNICODE_WHITESPACE = [ 68 (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> 69 0x0020, # White_Space # Zs SPACE 70 0x0085, # White_Space # Cc <control-0085> 71 0x00A0, # White_Space # Zs NO-BREAK SPACE 72 0x1680, # White_Space # Zs OGHAM SPACE MARK 73 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR 74 (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE 75 0x2028, # White_Space # Zl LINE SEPARATOR 76 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 77 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 78 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 79 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE 80 ].flatten.freeze 81 82 # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish 83 # between little and big endian. This is not an issue in utf-8, so it must be ignored. 
84 UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM 85 86 # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) 87 UTF8_PAT = /\A(?: 88 [\x00-\x7f] | 89 [\xc2-\xdf] [\x80-\xbf] | 90 \xe0 [\xa0-\xbf] [\x80-\xbf] | 91 [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | 92 \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | 93 [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | 94 \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] 95 )*\z/xn 96 97 # Returns a regular expression pattern that matches the passed Unicode codepoints 98 def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: 99 array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') 100 end 101 UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ 102 UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ 103 104 class << self 105 106 # /// 107 # /// BEGIN String method overrides 108 # /// 109 110 # Inserts the passed string at specified codepoint offsets 111 def insert(str, offset, fragment) 112 str.replace( 113 u_unpack(str).insert( 114 offset, 115 u_unpack(fragment) 116 ).flatten.pack('U*') 117 ) 118 end 119 120 # Returns the position of the passed argument in the string, counting in codepoints 121 def index(str, *args) 122 bidx = str.index(*args) 123 bidx ? (u_unpack(str.slice(0...bidx)).size) : nil 124 end 125 126 # Does Unicode-aware rstrip 127 def rstrip(str) 128 str.gsub(UNICODE_TRAILERS_PAT, '') 129 end 130 131 # Does Unicode-aware lstrip 132 def lstrip(str) 133 str.gsub(UNICODE_LEADERS_PAT, '') 134 end 135 136 # Removed leading and trailing whitespace 137 def strip(str) 138 str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '') 139 end 140 141 # Returns the number of codepoints in the string 142 def size(str) 143 u_unpack(str).size 144 end 145 alias_method :length, :size 146 147 # Reverses codepoints in the string. 
148 def reverse(str) 149 u_unpack(str).reverse.pack('U*') 150 end 151 152 # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that 153 # character. 154 def slice(str, *args) 155 if (args.size == 2 && args.first.is_a?(Range)) 156 raise TypeError, 'cannot convert Range into Integer' # Do as if we were native 157 elsif args[0].kind_of? Range 158 cps = u_unpack(str).slice(*args) 159 cps.nil? ? nil : cps.pack('U*') 160 elsif args[0].kind_of? Numeric 161 u_unpack(str)[args[0]] 162 else 163 str.slice(*args) 164 end 165 end 166 alias_method :[], :slice 167 168 # Convert characters in the string to uppercase 169 def upcase(str); to_case :uppercase_mapping, str; end 170 171 # Convert characters in the string to lowercase 172 def downcase(str); to_case :lowercase_mapping, str; end 173 174 # Returns a copy of +str+ with the first character converted to uppercase and the remainder to lowercase 175 def capitalize(str) 176 upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '') 177 end 178 179 # /// 180 # /// Extra String methods for unicode operations 181 # /// 182 183 # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for 184 # passing strings to databases and validations. 185 # 186 # * <tt>str</tt>: The string to perform normalization on. 187 # * <tt>form</tt>: The form you want to normalize in. Should be one of the following: :c, :kc, :d or :kd. 
188 def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) 189 # See http://www.unicode.org/reports/tr15, Table 1 190 codepoints = u_unpack(str) 191 case form 192 when :d 193 reorder_characters(decompose_codepoints(:canonical, codepoints)) 194 when :c 195 compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints)) 196 when :kd 197 reorder_characters(decompose_codepoints(:compatability, codepoints)) 198 when :kc 199 compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints)) 200 else 201 raise ArgumentError, "#{form} is not a valid normalization variant", caller 202 end.pack('U*') 203 end 204 205 # Perform decomposition on the characters in the string 206 def decompose(str) 207 decompose_codepoints(:canonical, u_unpack(str)).pack('U*') 208 end 209 210 # Perform composition on the characters in the string 211 def compose(str) 212 compose_codepoints u_unpack(str).pack('U*') 213 end 214 215 # /// 216 # /// BEGIN Helper methods for unicode operation 217 # /// 218 219 # Used to translate an offset from bytes to characters, for instance one received from a regular expression match 220 def translate_offset(str, byte_offset) 221 return 0 if str == '' 222 return nil if byte_offset.nil? 223 chunk = str[0..byte_offset] 224 begin 225 begin 226 chunk.unpack('U*').length - 1 227 rescue ArgumentError => e 228 chunk = str[0..(byte_offset+=1)] 229 # Stop retrying at the end of the string 230 raise e unless byte_offset < chunk.length 231 # We damaged a character, retry 232 retry 233 end 234 # Catch the ArgumentError so we can throw our own 235 rescue ArgumentError 236 raise EncodingError.new('malformed UTF-8 character') 237 end 238 end 239 240 # Checks if the string is valid UTF8. 
241 def consumes?(str) 242 # Unpack is a little bit faster than regular expressions 243 begin 244 str.unpack('U*') 245 true 246 rescue ArgumentError 247 false 248 end 249 end 250 251 # Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed 252 # in future versions. 253 def g_length(str) 254 g_unpack(str).length 255 end 256 257 # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string 258 def tidy_bytes(str) 259 str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join 260 end 261 262 protected 263 264 # Detect whether the codepoint is in a certain character class. Primarily used by the 265 # grapheme cluster support. 266 def in_char_class?(codepoint, classes) 267 classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false 268 end 269 270 # Unpack the string at codepoints boundaries 271 def u_unpack(str) 272 begin 273 str.unpack 'U*' 274 rescue ArgumentError 275 raise EncodingError.new('malformed UTF-8 character') 276 end 277 end 278 279 # Unpack the string at grapheme boundaries instead of codepoint boundaries 280 def g_unpack(str) 281 codepoints = u_unpack(str) 282 unpacked = [] 283 pos = 0 284 marker = 0 285 eoc = codepoints.length 286 while(pos < eoc) 287 pos += 1 288 previous = codepoints[pos-1] 289 current = codepoints[pos] 290 if ( 291 # CR X LF 292 one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or 293 # L X (L|V|LV|LVT) 294 two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or 295 # (LV|V) X (V|T) 296 three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or 297 # (LVT|T) X (T) 298 four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or 299 # X Extend 300 five = (UCD.boundary[:extend] === current) 301 ) 302 else 303 unpacked << codepoints[marker..pos-1] 304 marker = pos 305 end 306 end 307 unpacked 308 end 309 310 # Reverse operation of g_unpack 311 def 
g_pack(unpacked) 312 unpacked.flatten 313 end 314 315 # Convert characters to a different case 316 def to_case(way, str) 317 u_unpack(str).map do |codepoint| 318 cp = UCD[codepoint] 319 unless cp.nil? 320 ncp = cp.send(way) 321 ncp > 0 ? ncp : codepoint 322 else 323 codepoint 324 end 325 end.pack('U*') 326 end 327 328 # Re-order codepoints so the string becomes canonical 329 def reorder_characters(codepoints) 330 length = codepoints.length- 1 331 pos = 0 332 while pos < length do 333 cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]] 334 if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) 335 codepoints[pos..pos+1] = cp2.code, cp1.code 336 pos += (pos > 0 ? -1 : 1) 337 else 338 pos += 1 339 end 340 end 341 codepoints 342 end 343 344 # Decompose composed characters to the decomposed form 345 def decompose_codepoints(type, codepoints) 346 codepoints.inject([]) do |decomposed, cp| 347 # if it's a hangul syllable starter character 348 if HANGUL_SBASE <= cp and cp < HANGUL_SLAST 349 sindex = cp - HANGUL_SBASE 350 ncp = [] # new codepoints 351 ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT 352 ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT 353 tindex = sindex % HANGUL_TCOUNT 354 ncp << (HANGUL_TBASE + tindex) unless tindex == 0 355 decomposed.concat ncp 356 # if the codepoint is decomposable in with the current decomposition type 357 elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability) 358 decomposed.concat decompose_codepoints(type, ncp.dup) 359 else 360 decomposed << cp 361 end 362 end 363 end 364 365 # Compose decomposed characters to the composed form 366 def compose_codepoints(codepoints) 367 pos = 0 368 eoa = codepoints.length - 1 369 starter_pos = 0 370 starter_char = codepoints[0] 371 previous_combining_class = -1 372 while pos < eoa 373 pos += 1 374 lindex = starter_char - HANGUL_LBASE 375 # -- Hangul 376 if 0 <= lindex and lindex < HANGUL_LCOUNT 377 vindex = 
codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 378 if 0 <= vindex and vindex < HANGUL_VCOUNT 379 tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 380 if 0 <= tindex and tindex < HANGUL_TCOUNT 381 j = starter_pos + 2 382 eoa -= 2 383 else 384 tindex = 0 385 j = starter_pos + 1 386 eoa -= 1 387 end 388 codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE 389 end 390 starter_pos += 1 391 starter_char = codepoints[starter_pos] 392 # -- Other characters 393 else 394 current_char = codepoints[pos] 395 current = UCD[current_char] 396 if current.combining_class > previous_combining_class 397 if ref = UCD.composition_map[starter_char] 398 composition = ref[current_char] 399 else 400 composition = nil 401 end 402 unless composition.nil? 403 codepoints[starter_pos] = composition 404 starter_char = composition 405 codepoints.delete_at pos 406 eoa -= 1 407 pos -= 1 408 previous_combining_class = -1 409 else 410 previous_combining_class = current.combining_class 411 end 412 else 413 previous_combining_class = current.combining_class 414 end 415 if current.combining_class == 0 416 starter_pos = pos 417 starter_char = codepoints[pos] 418 end 419 end 420 end 421 codepoints 422 end 423 end 424 end 425 end -
activesupport/lib/active_support/multibyte/handlers/passthru_handler.rb
# Handler used by Chars whenever $KCODE is not set to 'UTF8'. It implements nothing besides
# translate_offset, so every other call ends up being forwarded to String.
class ActiveSupport::Multibyte::Handlers::PassthruHandler
  class << self
    # Byte offsets need no translation here: hand back +byte_offset+ untouched.
    def translate_offset(string, byte_offset) #:nodoc:
      byte_offset
    end
  end
end
activesupport/lib/active_support/multibyte/handlers/utf8_handler_proc.rb
# This handler delegates to the utf8proc ruby extension, which is significantly faster than the
# pure ruby implementation. Chars picks it automatically when the utf8proc extension can be loaded.
# See UTF8Handler for documentation on the handler methods.
class ActiveSupport::Multibyte::Handlers::UTF8HandlerProc < ActiveSupport::Multibyte::Handlers::UTF8Handler

  class << self
    # Normalizes +str+ in the requested +form+ (:d, :c, :kd or :kc) through utf8map.
    # Raises ArgumentError for any other form.
    def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) #:nodoc:
      # The unpacked codepoints are not used below. NOTE(review): the call is kept because it may be
      # relied upon to raise on malformed input before utf8proc is invoked - confirm.
      str.unpack('U*')
      flags =
        if form == :d
          [:stable]
        elsif form == :c
          [:stable, :compose]
        elsif form == :kd
          [:stable, :compat]
        elsif form == :kc
          [:stable, :compose, :compat]
        else
          raise ArgumentError, "#{form} is not a valid normalization variant", caller
        end
      utf8map(str, *flags)
    end

    # Decomposes +str+ using the :stable option only.
    def decompose(str) #:nodoc:
      utf8map(str, :stable)
    end

    # Lowercases +str+ by way of utf8proc's :casefold option.
    def downcase(str) #:nodoc:
      utf8map(str, :casefold)
    end

    protected

    # ORs together the Utf8Proc::Options flag of every given option name and runs
    # Utf8Proc::utf8map with the combined value. Raises ArgumentError for an unknown option name.
    def utf8map(str, *option_array) #:nodoc:
      bitmask = option_array.inject(0) do |bits, option|
        flag = Utf8Proc::Options[option]
        raise ArgumentError, "Unknown argument given to utf8map." unless flag
        bits | flag
      end
      Utf8Proc::utf8map(str, bitmask)
    end
  end
end
activesupport/lib/active_support/multibyte/chars.rb
require 'active_support/multibyte/handlers/utf8_handler'
require 'active_support/multibyte/handlers/passthru_handler'

# Encapsulates all the functionality related to the Chars proxy.
module ActiveSupport::Multibyte
  # Chars enables you to work transparently with multibyte encodings in the Ruby String class without having extensive
  # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
  # encoding safe manner. All the normal String methods are also implemented on the proxy.
  #
  # String methods are proxied through the Chars object, and can be accessed through the +chars+ method. Methods
  # which would normally return a String object now return a Chars object so methods can be chained.
  #
  #   "The Perfect String  ".chars.downcase.strip.normalize #=> "the perfect string"
  #
  # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
  # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
  #
  #   bad.explicit_checking_method "T".chars.downcase.to_s
  #
  # The actual operations on the string are delegated to handlers. Theoretically handlers can be implemented for
  # any encoding, but the default handler handles UTF-8. This handler is set during initialization, if you want to
  # use your own handler, you can set it on the Chars class. Look at the UTF8Handler source for an example of how to
  # implement your own handler. If you want your own handler to work on anything but UTF-8 you probably also
  # want to override Chars#handler.
  #
  #   ActiveSupport::Multibyte::Chars.handler = MyHandler
  #
  # Note that a few methods are defined on Chars instead of the handler because they are defined on Object or Kernel
  # and method_missing can't catch them.
  class Chars

    attr_reader :string # The contained string
    alias_method :to_s, :string

    include Comparable

    # The magic method to make String and Chars comparable
    def to_str
      # Using any other ways of overriding the String itself will lead you all the way from infinite loops to
      # core dumps. Don't go there.
      @string
    end

    # Create a new Chars instance. When +str+ answers to +string+ (another Chars, for instance) the
    # contained string is taken over, otherwise +str+ itself is wrapped.
    def initialize(str)
      @string = (str.string rescue str)
    end

    # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, equal or after the
    # object on the right side of the operation. It accepts any object that implements +to_s+. See String.<=>
    # for more details.
    def <=>(other); @string <=> other.to_s; end

    # Works just like String#split, with the exception that the items in the resulting list are Chars
    # instances instead of String. This makes chaining methods easier.
    def split(*args)
      @string.split(*args).map { |i| i.chars }
    end

    # Gsub works exactly the same as gsub on a normal string, but returns a Chars object.
    def gsub(*a, &b); @string.gsub(*a, &b).chars; end

    # Like String.=~ only it returns the character offset (in codepoints) instead of the byte offset.
    def =~(other)
      handler.translate_offset(@string, @string =~ other)
    end

    # Try to forward all undefined methods to the handler, when a method is not defined on the handler, send it to
    # the contained string. Method_missing is also responsible for making the bang! methods destructive.
    #
    # When the handler raises Handlers::EncodingError the contained string is tidied with the handler's
    # +tidy_bytes+ and the call is tried once more; a second EncodingError is passed on to the caller.
    # String results are wrapped in a Chars object again.
    def method_missing(m, *a, &b)
      tidied = false # set once the contained string has been cleaned up, so we retry at most once
      begin
        # Simulate methods with a ! at the end because we can't touch the enclosed string from the handlers.
        if m.to_s =~ /^(.*)\!$/
          result = handler.send($1, @string, *a, &b)
          if result == @string
            # Nothing changed: signal that with nil
            result = nil
          else
            @string.replace result
          end
        else
          result = handler.send(m, @string, *a, &b)
        end
      rescue NoMethodError
        result = @string.send(m, *a, &b)
      rescue Handlers::EncodingError
        # If tidying did not cure the problem, retrying again would loop forever: give up instead.
        raise if tidied
        tidied = true
        @string.replace handler.tidy_bytes(@string)
        retry
      end

      if result.kind_of?(String)
        result.chars
      else
        result
      end
    end

    # Set the handler class for the Char objects.
    def self.handler=(klass)
      @@handler = klass
    end

    # Returns the proper handler for the contained string depending on $KCODE and the encoding of the string. This
    # method is used internally to always redirect messages to the proper classes depending on the context.
    def handler
      if utf8_pragma?
        @@handler
      else
        ActiveSupport::Multibyte::Handlers::PassthruHandler
      end
    end

    private

    # +utf8_pragma+ checks if it can send this string to the handlers. It makes sure @string isn't nil and $KCODE is
    # set to 'UTF8'.
    def utf8_pragma?
      !@string.nil? && ($KCODE == 'UTF8')
    end
  end
end

# When we can load the utf8proc library, override normalization with the faster methods
begin
  require_library_or_gem('utf8proc_native')
  require 'active_support/multibyte/handlers/utf8_handler_proc'
  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
rescue LoadError
  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8Handler
end
activesupport/lib/active_support/multibyte/generators/generate_tables.rb
#!/usr/bin/env ruby

require File.dirname(__FILE__) + '/../../../active_support'
require 'open-uri'

module ActiveSupport::Multibyte::Handlers
  # Downloads the Unicode data files listed in SOURCES, parses them into a UnicodeDatabase and dumps
  # the resulting tables with Marshal.
  class UnicodeTableGenerator #:nodoc:
    # Maps a table type to the file it is parsed from; every type has a matching parse_<type> method
    SOURCES = {
      :codepoints => 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt',
      :composition_exclusion => 'http://www.unicode.org/Public/5.0.0/ucd/CompositionExclusions.txt',
      :grapheme_break_property => 'http://www.unicode.org/Public/5.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'
    }

    # Sets up an empty database. Codepoints that are never parsed fall back to one shared default
    # record with combining class 0 and no case mappings.
    def initialize
      @ucd = UnicodeDatabase.new

      default = Codepoint.new
      default.combining_class = 0
      default.uppercase_mapping = 0
      default.lowercase_mapping = 0
      @ucd.codepoints = Hash.new(default)

      @ucd.composition_exclusion = []
      @ucd.composition_map = {}
      @ucd.boundary = {}
    end

    # Parses one line of UnicodeData.txt and stores the resulting Codepoint under its code.
    # Raises a RuntimeError when the line does not match the expected layout.
    def parse_codepoints(line)
      codepoint = Codepoint.new
      raise "Could not parse input." unless line =~ /^
        ([0-9A-F]+);        # code
        ([^;]+);            # name
        ([A-Z]+);           # general category
        ([0-9]+);           # canonical combining class
        ([A-Z]+);           # bidi class
        (<([A-Z]*)>)?       # decomposition type
        ((\ ?[0-9A-F]+)*);  # decompomposition mapping
        ([0-9]*);           # decimal digit
        ([0-9]*);           # digit
        ([^;]*);            # numeric
        ([YN]*);            # bidi mirrored
        ([^;]*);            # unicode 1.0 name
        ([^;]*);            # iso comment
        ([0-9A-F]*);        # simple uppercase mapping
        ([0-9A-F]*);        # simple lowercase mapping
        ([0-9A-F]*)$/ix     # simple titlecase mapping
      codepoint.code              = $1.hex
      #codepoint.name              = $2
      #codepoint.category          = $3
      codepoint.combining_class   = Integer($4)
      #codepoint.bidi_class        = $5
      codepoint.decomp_type       = $7
      codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
      #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
      # A missing case mapping is stored as 0
      codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
      codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
      #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
      @ucd.codepoints[codepoint.code] = codepoint
    end

    # Parses one line of GraphemeBreakProperty.txt. The codepoint, or range of codepoints, is appended
    # to the boundary list of its (downcased) property name. Lines that do not match are ignored.
    def parse_grapheme_break_property(line)
      if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
        type = $2.downcase.intern
        @ucd.boundary[type] ||= []
        if $1.include? '..'
          parts = $1.split '..'
          @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
        else
          @ucd.boundary[type] << $1.hex
        end
      end
    end

    # Parses one line of CompositionExclusions.txt: a line starting with a hexadecimal codepoint adds
    # that codepoint to the exclusion list, anything else is ignored.
    def parse_composition_exclusion(line)
      if line =~ /^([0-9A-F]+)/i
        @ucd.composition_exclusion << $1.hex
      end
    end

    # Builds the composition map (first codepoint => { second codepoint => composed codepoint }) from
    # every parsed codepoint that has combining class 0, no decomposition type, a two-codepoint
    # decomposition mapping whose first codepoint also has combining class 0, and that is not listed
    # in the composition exclusions.
    def create_composition_map
      @ucd.codepoints.each do |_, cp|
        if !cp.nil? and
           cp.combining_class == 0 and
           cp.decomp_type.nil? and
           !cp.decomp_mapping.nil? and
           cp.decomp_mapping.length == 2 and
           @ucd[cp.decomp_mapping[0]].combining_class == 0 and
           !@ucd.composition_exclusion.include?(cp.code)
          @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
          @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
        end
      end
    end

    # Replaces the :lf and :cr boundary lists by their first entry.
    def normalize_boundary_map
      @ucd.boundary.each do |k,v|
        if [:lf, :cr].include? k
          @ucd.boundary[k] = v[0]
        end
      end
    end

    # Fetches every file in SOURCES (a copy that already exists in /tmp is reused instead of being
    # downloaded again), feeds it line by line to the matching parse_<type> method and finally
    # derives the composition map and the normalized boundary map.
    def parse
      SOURCES.each do |type, url|
        filename = "/tmp/#{url.split('/').last}"
        unless File.exist?(filename)
          $stderr.puts "Downloading #{url.split('/').last}"
          File.open(filename, 'w') do |target|
            open(url) do |source|
              source.each_line { |line| target.write line }
            end
          end
        end
        File.open(filename) do |file|
          file.each_line { |line| send "parse_#{type}".intern, line }
        end
      end
      create_composition_map
      normalize_boundary_map
    end

    # Writes the Marshal dump of the codepoints, composition exclusions, composition map and boundary
    # tables (in that order) to +filename+.
    def dump_to(filename)
      # Marshal output is binary data, so open in binary mode: text mode would translate newline
      # bytes on platforms that do so and corrupt the dump.
      File.open(filename, 'wb') do |f|
        f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary])
      end
    end
  end
end

# When run as a script: regenerate the tables at the location UnicodeDatabase expects them
if __FILE__ == $0
  filename = ActiveSupport::Multibyte::Handlers::UnicodeDatabase.filename
  generator = ActiveSupport::Multibyte::Handlers::UnicodeTableGenerator.new
  generator.parse
  print "Writing to: #{filename}"
  generator.dump_to filename
  puts " (#{File.size(filename)} bytes)"
end
activesupport/lib/active_support.rb
old new 40 40 require 'active_support/values/time_zone' 41 41 42 42 require 'active_support/json' 43 44 require 'active_support/multibyte'