class Encoding::Converter
Encoding
转换类。
常量
- AFTER_OUTPUT
在完成某些输出后,但在消耗所有输入之前停止转换。有关示例,请参见
primitive_convert
。
- CRLF_NEWLINE_DECORATOR
用于将 LF 转换为 CRLF 的装饰器
- CR_NEWLINE_DECORATOR
用于将 LF 转换为 CR 的装饰器
- INVALID_MASK
无效字节序列的掩码
- INVALID_REPLACE
替换无效字节序列
- LF_NEWLINE_DECORATOR
用于在写入时将 CRLF 和 CR 转换为 LF 的装饰器
- PARTIAL_INPUT
指示源可能是较大字符串的一部分。有关示例,请参见
primitive_convert
。
- UNDEF_HEX_CHARREF
用 XML 十六进制字符引用替换目标编码中未定义的字节序列。这对于 XML 转换有效。
- UNDEF_MASK
用于表示源编码中有效的字符,但在目标编码中没有相关字符的掩码。
- UNDEF_REPLACE
替换目标编码中未定义的字节序列。
- UNIVERSAL_NEWLINE_DECORATOR
用于将 CRLF 和 CR 转换为 LF 的装饰器
- XML_ATTR_CONTENT_DECORATOR
转义为 XML AttValue
- XML_ATTR_QUOTE_DECORATOR
转义为 XML AttValue
- XML_TEXT_DECORATOR
转义为 XML CharData
公共类方法
返回相应的 ASCII 兼容编码。
如果参数是 ASCII 兼容编码,则返回 nil。
“相应的 ASCII 兼容编码”是一种 ASCII 兼容编码,它可以准确地表示与给定 ASCII 不兼容编码相同的字符。因此,在两个编码之间进行转换时,不会发生转换未定义错误。
Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * Implements Encoding::Converter.asciicompat_encoding.
 *
 * `arg` is resolved to an encoding name through enc_arg(); that name is
 * handed to rb_econv_asciicompat_encoding().  When a name comes back it is
 * turned into an Encoding object, otherwise nil is returned.
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *src_name;
    const char *compat_name;
    rb_encoding *src_enc;

    enc_arg(&arg, &src_name, &src_enc);

    compat_name = rb_econv_asciicompat_encoding(src_name);
    if (!compat_name) {
        /* No counterpart reported for this encoding. */
        return Qnil;
    }

    return rb_enc_from_encoding(make_encoding(compat_name));
}
可能的选项元素
hash form: :invalid => nil # raise error on invalid byte sequence (default) :invalid => :replace # replace invalid byte sequence :undef => nil # raise error on undefined conversion (default) :undef => :replace # replace undefined conversion :replace => string # replacement string ("?" or "\uFFFD" if not specified) :newline => :universal # decorator for converting CRLF and CR to LF :newline => :lf # decorator for converting CRLF and CR to LF when writing :newline => :crlf # decorator for converting LF to CRLF :newline => :cr # decorator for converting LF to CR :universal_newline => true # decorator for converting CRLF and CR to LF :crlf_newline => true # decorator for converting LF to CRLF :cr_newline => true # decorator for converting LF to CR :lf_newline => true # decorator for converting CRLF and CR to LF when writing :xml => :text # escape as XML CharData. :xml => :attr # escape as XML AttValue integer form: Encoding::Converter::INVALID_REPLACE Encoding::Converter::UNDEF_REPLACE Encoding::Converter::UNDEF_HEX_CHARREF Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR Encoding::Converter::LF_NEWLINE_DECORATOR Encoding::Converter::CRLF_NEWLINE_DECORATOR Encoding::Converter::CR_NEWLINE_DECORATOR Encoding::Converter::XML_TEXT_DECORATOR Encoding::Converter::XML_ATTR_CONTENT_DECORATOR Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
Encoding::Converter.new
创建 Encoding::Converter
的实例。
source_encoding 和 destination_encoding
应该是一个字符串或 Encoding
对象。
opt 应该是 nil,一个哈希或一个整数。
convpath 应该是一个数组。convpath 可以包含
-
包含编码或编码名称的两个元素数组,或
-
表示装饰器名称的字符串。
Encoding::Converter.new
可以选择接受一个选项。该选项应该是一个哈希或一个整数。选项哈希可以包含 :invalid => nil 等。选项整数应该是常量(例如 Encoding::Converter::INVALID_REPLACE
等)的逻辑或。
- :invalid => nil
-
在无效的字节序列上引发错误。这是默认行为。
- :invalid => :replace
-
用替换字符串替换无效的字节序列。
- :undef => nil
-
如果
source_encoding
中的字符未在 destination_encoding 中定义,则引发错误。这是默认行为。
- :undef => :replace
-
用替换字符串替换
destination_encoding
中未定义的字符。
- :replace => string
-
指定替换字符串。如果未指定,Unicode 编码使用“\uFFFD”,其他编码使用“?”。
- :universal_newline => true
-
将 CRLF 和 CR 转换为 LF。
- :crlf_newline => true
-
将 LF 转换为 CRLF。
- :cr_newline => true
-
将 LF 转换为 CR。
- :lf_newline => true
-
将 CRLF 和 CR 转换为 LF(写入时)。
- :xml => :text
-
转义为 XML CharData。此形式可以用作 HTML 4.0 PCDATA。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
destination_encoding
中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
-
- :xml => :attr
-
转义为 XML AttValue。转换后的结果用“…”引用。此形式可以用作 HTML 4.0 属性值。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
‘"’ -> ‘&quot;’
-
destination_encoding
中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
-
示例
# UTF-16BE to UTF-8 ec = Encoding::Converter.new("UTF-16BE", "UTF-8") # Usually, decorators such as newline conversion are inserted last. ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], # "universal_newline"] # But, if the last encoding is ASCII incompatible, # decorators are inserted before the last conversion. ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) p ec.convpath #=> ["crlf_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] # Conversion path can be specified directly. ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) p ec.convpath #=> ["universal_newline", # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
/*
 * Implements Encoding::Converter#initialize.
 *
 * Two call shapes are handled:
 *   - a single Array argument: the converter is built from that conversion
 *     path via rb_econv_init_by_convpath();
 *   - anything else: the arguments are parsed by econv_args() and the
 *     converter is opened with rb_econv_open_opts().
 *
 * Raises TypeError when the object already carries converter data, and the
 * exception built by rb_econv_open_exc() when no converter could be opened.
 * Returns self.
 */
static VALUE
econv_init(int argc, VALUE *argv, VALUE self)
{
    VALUE opts;
    VALUE src_name_v, dst_name_v;
    VALUE path_ary = Qnil;
    const char *src_name, *dst_name;
    rb_encoding *src_enc, *dst_enc;
    rb_econv_t *conv;
    int flags;

    if (rb_check_typeddata(self, &econv_data_type)) {
        rb_raise(rb_eTypeError, "already initialized");
    }

    /* Only a lone argument is probed for being an Array (conversion path). */
    if (argc == 1) {
        path_ary = rb_check_array_type(argv[0]);
    }

    if (NIL_P(path_ary)) {
        econv_args(argc, argv, &src_name_v, &dst_name_v,
                   &src_name, &dst_name, &src_enc, &dst_enc,
                   &flags, &opts);
        conv = rb_econv_open_opts(src_name, dst_name, flags, opts);
    }
    else {
        conv = rb_econv_init_by_convpath(self, path_ary,
                                         &src_name, &dst_name,
                                         &src_enc, &dst_enc);
        flags = 0;
        opts = Qnil;
    }

    if (!conv) {
        VALUE exc = rb_econv_open_exc(src_name, dst_name, flags);
        /* Keep the name objects referenced until the C strings are no longer used. */
        RB_GC_GUARD(src_name_v);
        RB_GC_GUARD(dst_name_v);
        rb_exc_raise(exc);
    }

    if (!DECORATOR_P(src_name, dst_name)) {
        /* Fall back to dummy encodings when either side was not resolved. */
        if (!src_enc) {
            src_enc = make_dummy_encoding(src_name);
        }
        if (!dst_enc) {
            dst_enc = make_dummy_encoding(dst_name);
        }
        RB_GC_GUARD(src_name_v);
        RB_GC_GUARD(dst_name_v);
    }

    conv->source_encoding = src_enc;
    conv->destination_encoding = dst_enc;

    DATA_PTR(self) = conv;

    return self;
}
返回转换路径。
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "universal_newline"] p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # "universal_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
/*
 * Implements Encoding::Converter.search_convpath.
 *
 * Parses the arguments with econv_args(), lets transcode_search_path()
 * fill in the conversion path through the search_convpath_i callback, and
 * then applies decorate_convpath() with the parsed flags.
 *
 * Raises the exception built by rb_econv_open_exc() when no path was
 * produced or when decorating the path fails.  Returns the path array.
 */
static VALUE
econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
{
    VALUE src_name_v, dst_name_v;
    VALUE opts;
    VALUE path = Qnil;
    const char *src_name, *dst_name;
    rb_encoding *src_enc, *dst_enc;
    int flags;

    econv_args(argc, argv, &src_name_v, &dst_name_v,
               &src_name, &dst_name, &src_enc, &dst_enc,
               &flags, &opts);

    transcode_search_path(src_name, dst_name, search_convpath_i, &path);

    /*
     * Short-circuit keeps the original ordering: decorate_convpath() is only
     * attempted once a path exists.
     */
    if (NIL_P(path) || decorate_convpath(path, flags) == -1) {
        VALUE exc = rb_econv_open_exc(src_name, dst_name, flags);
        RB_GC_GUARD(src_name_v);
        RB_GC_GUARD(dst_name_v);
        rb_exc_raise(exc);
    }

    return path;
}
公共实例方法
/*
 * Implements Encoding::Converter#==.
 *
 * Returns nil when `other` is not a converter object, false when the two
 * converters differ, and true when they match on: source and destination
 * encoding names, flags, replacement encoding/length/bytes, and the
 * sequence of transcoders.
 *
 * Change from the previous version: the replacement fields are now checked
 * for NULL before being passed to strcmp()/memcmp().
 * NOTE(review): econv_get_replacement() calls make_replacement() before it
 * reads these fields, which suggests they are filled in lazily and may be
 * NULL on one converter but not the other; passing NULL to strcmp() is
 * undefined behaviour.  Confirm against the rb_econv_t initialisation code.
 */
static VALUE
econv_equal(VALUE self, VALUE other)
{
    rb_econv_t *ec1 = check_econv(self);
    rb_econv_t *ec2;
    int i;

    if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
        return Qnil;
    }
    ec2 = DATA_PTR(other);
    if (!ec2) return Qfalse;

    /* Pointer identity is a cheap shortcut before the string comparison. */
    if (ec1->source_encoding_name != ec2->source_encoding_name &&
        strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
        return Qfalse;
    if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
        strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
        return Qfalse;
    if (ec1->flags != ec2->flags) return Qfalse;

    if (ec1->replacement_enc != ec2->replacement_enc) {
        /* One side set, the other not: treat as different instead of
         * dereferencing NULL inside strcmp(). */
        if (!ec1->replacement_enc || !ec2->replacement_enc) return Qfalse;
        if (strcmp(ec1->replacement_enc, ec2->replacement_enc)) return Qfalse;
    }

    if (ec1->replacement_len != ec2->replacement_len) return Qfalse;

    /* Lengths are equal here; nothing to compare when that length is zero. */
    if (ec2->replacement_len > 0 &&
        ec1->replacement_str != ec2->replacement_str) {
        if (!ec1->replacement_str || !ec2->replacement_str) return Qfalse;
        if (memcmp(ec1->replacement_str, ec2->replacement_str,
                   ec2->replacement_len))
            return Qfalse;
    }

    if (ec1->num_trans != ec2->num_trans) return Qfalse;
    for (i = 0; i < ec1->num_trans; i++) {
        if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
            return Qfalse;
    }
    return Qtrue;
}
转换 source_string 并返回 destination_string。
假设 source_string 是源的一部分。即,内部指定了 :partial_input=>true。最后应该使用 finish 方法。
ec = Encoding::Converter.new("utf-8", "euc-jp") puts ec.convert("\u3042").dump #=> "\xA4\xA2" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("euc-jp", "utf-8") puts ec.convert("\xA4").dump #=> "" puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("utf-8", "iso-2022-jp") puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
如果发生转换错误,则会引发 Encoding::UndefinedConversionError
或 Encoding::InvalidByteSequenceError
。Encoding::Converter#convert
不提供从这些异常中恢复或重新启动的方法。当您想要处理这些转换错误时,请使用 Encoding::Converter#primitive_convert
。
/*
 * Implements Encoding::Converter#convert.
 *
 * Runs econv_primitive_convert() on a copy of `source_string` with the
 * ECONV_PARTIAL_INPUT flag, collecting the output in a fresh string.
 *
 * Raises the exception from make_econv_exception() on
 * :invalid_byte_sequence, :undefined_conversion or :incomplete_input, and
 * ArgumentError when the converter reports :finished.  Any result other
 * than :source_buffer_empty is treated as an interpreter bug.
 */
static VALUE
econv_convert(VALUE self, VALUE source_string)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE out;
    VALUE status;

    StringValue(source_string);

    out = rb_str_new(NULL, 0);

    /* primitive_convert consumes its source buffer, so pass a duplicate. */
    args[0] = rb_str_dup(source_string);
    args[1] = out;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2NUM(ECONV_PARTIAL_INPUT);

    status = econv_primitive_convert(5, args, self);

    if (status == sym_source_buffer_empty) {
        return out;
    }

    if (status == sym_invalid_byte_sequence ||
        status == sym_undefined_conversion ||
        status == sym_incomplete_input) {
        rb_exc_raise(make_econv_exception(ec));
    }

    if (status == sym_finished) {
        rb_raise(rb_eArgError, "converter already finished");
    }

    rb_bug("unexpected result of econv_primitive_convert");

    return out;
}
返回 ec 的转换路径。
结果是转换数组。
ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) p ec.convpath #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "crlf_newline"]
数组的每个元素都是一对编码或一个字符串。一对表示编码转换。字符串表示装饰器。
在上面的示例中,[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] 表示从 ISO-8859-1 到 UTF-8 的转换器。“crlf_newline” 表示从 LF 到 CRLF 的换行符转换器。
/*
 * Implements Encoding::Converter#convpath.
 *
 * Walks the converter's transcoder elements in order and builds an array
 * with one entry per element: a String (the transcoder's dst_encoding)
 * when DECORATOR_P says the step is a decorator, otherwise a two-element
 * pair of encoding objects for the step's source and destination.
 */
static VALUE
econv_convpath(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE path = rb_ary_new();
    int idx = 0;

    while (idx < ec->num_trans) {
        const rb_transcoder *step = ec->elems[idx].tc->transcoder;
        VALUE entry;

        if (DECORATOR_P(step->src_encoding, step->dst_encoding)) {
            entry = rb_str_new_cstr(step->dst_encoding);
        }
        else {
            entry = rb_assoc_new(make_encobj(step->src_encoding),
                                 make_encobj(step->dst_encoding));
        }
        rb_ary_push(path, entry);
        idx++;
    }

    return path;
}
以 Encoding
对象形式返回目标编码。
/*
 * Implements Encoding::Converter#destination_encoding.
 *
 * Returns econv_get_encoding() applied to the converter's stored
 * destination encoding.
 */
static VALUE
econv_destination_encoding(VALUE self)
{
    return econv_get_encoding(check_econv(self)->destination_encoding);
}
完成转换器。它返回转换后的字符串的最后部分。
ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
p ec.convert("\u3042") #=> "\e$B$\""
p ec.finish #=> "\e(B"
/*
 * Implements Encoding::Converter#finish.
 *
 * Calls econv_primitive_convert() with a nil source buffer and no flags,
 * collecting whatever output remains in a fresh string, which is returned.
 *
 * Raises the exception from make_econv_exception() on
 * :invalid_byte_sequence, :undefined_conversion or :incomplete_input.
 * Any result other than :finished is treated as an interpreter bug.
 */
static VALUE
econv_finish(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE out;
    VALUE status;

    out = rb_str_new(NULL, 0);

    args[0] = Qnil;          /* no more input */
    args[1] = out;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2FIX(0);    /* no flags */

    status = econv_primitive_convert(5, args, self);

    if (status == sym_finished) {
        return out;
    }

    if (status == sym_invalid_byte_sequence ||
        status == sym_undefined_conversion ||
        status == sym_incomplete_input) {
        rb_exc_raise(make_econv_exception(ec));
    }

    rb_bug("unexpected result of econv_primitive_convert");

    return out;
}
将 string 插入到编码转换器中。该字符串将转换为目标编码,并在以后的转换中输出。
如果目标编码是有状态的,则根据状态转换字符串并更新状态。
仅当发生转换错误时才应使用此方法。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") src = "HIRAGANA LETTER A is \u{3042}." dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] ec.insert_output("<err>") p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] ec = Encoding::Converter.new("utf-8", "iso-2022-jp") src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] ec.insert_output "?" # state change required to output "?". p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
/*
 * Implements Encoding::Converter#insert_output.
 *
 * The string is first re-encoded into the encoding named by
 * rb_econv_encoding_to_insert_output(), then handed to
 * rb_econv_insert_output().
 *
 * Raises ArgumentError ("too big string") when the insertion reports -1.
 * Always returns nil.
 */
static VALUE
econv_insert_output(VALUE self, VALUE string)
{
    rb_econv_t *ec = check_econv(self);
    const char *target_name;
    VALUE target_enc;

    StringValue(string);

    target_name = rb_econv_encoding_to_insert_output(ec);
    target_enc = rb_enc_from_encoding(rb_enc_find(target_name));
    string = rb_str_encode(string, target_enc, 0, Qnil);

    if (rb_econv_insert_output(ec,
                               (const unsigned char *)RSTRING_PTR(string),
                               RSTRING_LEN(string),
                               target_name) == -1) {
        rb_raise(rb_eArgError, "too big string");
    }

    return Qnil;
}
返回 ec 的可打印版本
ec = Encoding::Converter.new("iso-8859-1", "utf-8")
puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
/*
 * Implements Encoding::Converter#inspect.
 *
 * Produces "#<ClassName: DESCRIPTION>", where DESCRIPTION is appended by
 * econv_description() from the source/destination encoding names and the
 * flags.  An object without converter data yields
 * "#<ClassName: uninitialized>".
 */
static VALUE
econv_inspect(VALUE self)
{
    const char *class_name = rb_obj_classname(self);
    rb_econv_t *ec;
    VALUE text;

    TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);

    if (!ec) {
        return rb_sprintf("#<%s: uninitialized>", class_name);
    }

    text = rb_sprintf("#<%s: ", class_name);
    econv_description(ec->source_encoding_name,
                      ec->destination_encoding_name,
                      ec->flags, text);
    rb_str_cat2(text, ">");

    return text;
}
返回上次转换的异常对象。如果上次转换没有产生错误,则返回 nil。
这里的“错误”是指:对于 Encoding::Converter#convert,为 Encoding::InvalidByteSequenceError 和 Encoding::UndefinedConversionError;对于 Encoding::Converter#primitive_convert,为 :invalid_byte_sequence、:incomplete_input 和 :undefined_conversion。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full p ec.last_error #=> nil
/*
 * Implements Encoding::Converter#last_error.
 *
 * Returns whatever make_econv_exception() builds for the converter: an
 * exception object, or nil when it yields nil.
 */
static VALUE
econv_last_error(VALUE self)
{
    return make_econv_exception(check_econv(self));
}
可能的 opt 元素
hash form:
  :partial_input => true # source buffer may be part of larger source
  :after_output => true # stop conversion after output before input
integer form:
  Encoding::Converter::PARTIAL_INPUT
  Encoding::Converter::AFTER_OUTPUT
可能的结果
:invalid_byte_sequence :incomplete_input :undefined_conversion :after_output :destination_buffer_full :source_buffer_empty :finished
primitive_convert
将 source_buffer 转换为 destination_buffer。
source_buffer 应该是一个字符串或 nil。nil 表示空字符串。
destination_buffer 应该是一个字符串。
destination_byteoffset 应该是一个整数或 nil。nil 表示 destination_buffer 的末尾。如果省略,则假设为 nil。
destination_bytesize 应该是一个整数或 nil。nil 表示无限制。如果省略,则假设为 nil。
opt 应该为 nil、哈希或整数。nil 表示没有标志。如果省略,则假设为 nil。
primitive_convert
从头开始转换 source_buffer 的内容,并将结果存储到 destination_buffer 中。
destination_byteoffset 和 destination_bytesize 指定存储转换结果的区域。destination_byteoffset 以字节为单位指定 destination_buffer 中的起始位置。如果 destination_byteoffset 为 nil,则使用 destination_buffer.bytesize 来追加结果。destination_bytesize 指定最大字节数。如果 destination_bytesize 为 nil,则目标大小不受限制。转换后,destination_buffer 的大小将调整为 destination_byteoffset + 实际生成的字节数。此外,destination_buffer 的编码设置为 destination_encoding。
primitive_convert
删除 source_buffer 的转换部分。删除的部分在 destination_buffer 中转换或缓存在 Encoding::Converter
对象中。
当满足以下条件之一时,primitive_convert
会停止转换。
-
在源缓冲区中找到无效的字节序列 (:invalid_byte_sequence)
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
源缓冲区意外结束 (:incomplete_input)。仅当未指定 :partial_input 时才会发生此情况。
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
字符在输出编码中不可表示 (:undefined_conversion)
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
在生成一些输出后,在完成输入之前 (:after_output)。仅当指定了 :after_output 时才会发生此情况。
-
目标缓冲区已满 (:destination_buffer_full)。仅当 destination_bytesize 为非 nil 时才会发生此情况。
-
源缓冲区为空 (:source_buffer_empty)。仅当指定了 :partial_input 时才会发生此情况。
-
转换已完成 (:finished)
示例
ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 100) p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * Implements Encoding::Converter#primitive_convert.
 *
 * Arguments (parsed with rb_scan_args format "23:"):
 *   input               source buffer String, or nil
 *   output              destination buffer String (modified in place)
 *   output_byteoffset   Integer or nil; nil means "end of output"
 *   output_bytesize     Integer or nil; nil means "grow as needed"
 *   flags / opt         either an Integer flag word or an option Hash
 *                       (:partial_input, :after_output), not both
 *
 * Consumed bytes are dropped from `input`; `output` is resized to the
 * bytes actually written and associated with the destination encoding
 * when one is set.  Returns the conversion result as a Symbol via
 * econv_result_to_symbol().
 *
 * Raises ArgumentError for a negative or out-of-range offset/size, and
 * when the automatically grown output size would exceed LONG_MAX / 2
 * before doubling.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    /* nil offset/size get placeholder values here; the real ones are
     * computed further down. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0; /* dummy */
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    if (NIL_P(output_bytesize_v))
        output_bytesize = 0; /* dummy */
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* An integer flag word and an option hash are mutually exclusive. */
        if (!NIL_P(opt)) {
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        /* Translate the recognised hash keys into flag bits. */
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    /* No explicit size: start from the output's current capacity, but at
     * least as large as the input length. */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = rb_str_capacity(output);

        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* nil offset means "append": recomputed on every pass because the
     * output length changes between retries. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* Unsigned arithmetic so that wrap-around can be detected explicitly. */
    output_byteend = (unsigned long)output_byteoffset +
                     (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    /* Make sure the destination can hold [offset, offset + size). */
    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    /* ip and op are advanced by the call to mark how far it got. */
    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input)) {
        /* Remove the bytes the converter has taken from the source. */
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
    }

    /* Caller gave no size limit and the buffer filled up: double the
     * window and continue, appending after what was already written. */
    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}
primitive_errinfo
以包含 5 个元素的数组形式返回上次错误的重要信息:
[result, enc1, enc2, error_bytes, readagain_bytes]
result 是 primitive_convert 的最后一个结果。
其他元素仅当 result 为 :invalid_byte_sequence、:incomplete_input 或 :undefined_conversion 时才有意义。
enc1 和 enc2 指示作为字符串对的转换步骤。例如,从 EUC-JP 到 ISO-8859-1 的转换器按如下方式转换字符串:EUC-JP -> UTF-8 -> ISO-8859-1。因此,[enc1, enc2] 要么是 [“EUC-JP”, “UTF-8”],要么是 [“UTF-8”, “ISO-8859-1”]。
error_bytes 和 readagain_bytes 指示导致错误的字节序列。error_bytes 是丢弃的部分。readagain_bytes 是缓存在下次转换时重新读取的部分。
示例
# \xff is invalid as EUC-JP. ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") ec.primitive_convert(src="\xff", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. # Since this error is occur in UTF-8 to ISO-8859-1 conversion, # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) p ec.primitive_errinfo #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] # partial character is invalid ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10) p ec.primitive_errinfo #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by # partial characters. ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) p ec.primitive_errinfo #=> [:source_buffer_empty, nil, nil, nil, nil] # \xd8\x00\x00@ is invalid as UTF-16BE because # no low surrogate after high surrogate (\xd8\x00). # It is detected by 3rd byte (\00) which is part of next character. # So the high surrogate (\xd8\x00) is discarded and # the 3rd byte is read again later. # Since the byte is buffered in ec, it is dropped from src. ec = Encoding::Converter.new("UTF-16BE", "UTF-8") ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] p src #=> "@" # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. # The problem is detected by 4th byte. ec = Encoding::Converter.new("UTF-16LE", "UTF-8") ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] p src #=> ""
/*
 * Implements Encoding::Converter#primitive_errinfo.
 *
 * Builds a five-element array from the converter's last_error record:
 *   [0] result symbol (econv_result_to_symbol)
 *   [1] source encoding name, when recorded
 *   [2] destination encoding name, when recorded
 *   [3] error bytes, when an error byte range is recorded
 *   [4] read-again bytes that directly follow the error bytes
 * Slots without recorded data stay nil.
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE info = rb_ary_new2(5);

    rb_ary_store(info, 0, econv_result_to_symbol(ec->last_error.result));
    /* Storing the last slot first fixes the array length at five. */
    rb_ary_store(info, 4, Qnil);

    if (ec->last_error.source_encoding) {
        rb_ary_store(info, 1,
                     rb_str_new_cstr(ec->last_error.source_encoding));
    }

    if (ec->last_error.destination_encoding) {
        rb_ary_store(info, 2,
                     rb_str_new_cstr(ec->last_error.destination_encoding));
    }

    if (ec->last_error.error_bytes_start) {
        const char *err_bytes =
            (const char *)ec->last_error.error_bytes_start;

        rb_ary_store(info, 3,
                     rb_str_new(err_bytes,
                                ec->last_error.error_bytes_len));
        rb_ary_store(info, 4,
                     rb_str_new(err_bytes + ec->last_error.error_bytes_len,
                                ec->last_error.readagain_len));
    }

    return info;
}
放回将被转换的字节。
字节由 invalid_byte_sequence 错误引起。发生 invalid_byte_sequence 错误时,会丢弃一些字节,并将一些字节缓冲以稍后转换。可以放回后者的字节。可以通过 Encoding::InvalidByteSequenceError#readagain_bytes
和 Encoding::Converter#primitive_errinfo
观察到它。
ec = Encoding::Converter.new("utf-16le", "iso-8859-1") src = "\x00\xd8\x61\x00" dst = "" p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] p ec.putback #=> "a\x00" p ec.putback #=> "" # no more bytes to put back
/*
 * Implements Encoding::Converter#putback.
 *
 * Accepts zero or one argument.  Without an argument (or with nil) all
 * bytes reported by rb_econv_putbackable() are put back; with an Integer
 * the count is capped at that amount.  The bytes are returned as a new
 * String, tagged with the converter's source encoding when one is set.
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE limit = Qnil;
    VALUE bytes;
    int count;

    /* rb_check_arity() yields argc, so non-zero means an argument exists. */
    if (rb_check_arity(argc, 0, 1)) {
        limit = argv[0];
    }

    if (NIL_P(limit)) {
        count = rb_econv_putbackable(ec);
    }
    else {
        int available;

        count = NUM2INT(limit);
        available = rb_econv_putbackable(ec);
        if (available < count) {
            count = available;
        }
    }

    bytes = rb_str_new(NULL, count);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(bytes), count);

    if (ec->source_encoding) {
        rb_enc_associate(bytes, ec->source_encoding);
    }

    return bytes;
}
返回替换字符串。
ec = Encoding::Converter.new("euc-jp", "us-ascii")
p ec.replacement #=> "?"
ec = Encoding::Converter.new("euc-jp", "utf-8")
p ec.replacement #=> "\uFFFD"
/*
 * Implements Encoding::Converter#replacement.
 *
 * Calls make_replacement() first, then returns the converter's
 * replacement bytes as a String in the encoding named by replacement_enc.
 *
 * Raises Encoding::UndefinedConversionError when make_replacement()
 * reports -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    rb_encoding *repl_enc;

    if (make_replacement(ec) == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    repl_enc = rb_enc_find(ec->replacement_enc);

    return rb_enc_str_new((const char *)ec->replacement_str,
                          (long)ec->replacement_len,
                          repl_enc);
}
设置替换字符串。
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
ec.replacement = "<undef>"
p ec.convert("a \u3042 b") #=> "a <undef> b"
/*
 * Implements Encoding::Converter#replacement=.
 *
 * Coerces the argument to a String and passes its bytes, length and
 * encoding name to rb_econv_set_replacement().
 *
 * Raises Encoding::UndefinedConversionError when that call reports -1.
 * Returns the argument as it was passed in.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE repl = arg;
    const char *enc_name;

    StringValue(repl);
    enc_name = rb_enc_name(rb_enc_get(repl));

    if (rb_econv_set_replacement(ec,
                                 (const unsigned char *)RSTRING_PTR(repl),
                                 RSTRING_LEN(repl),
                                 enc_name) == -1) {
        /* xxx: rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}
以 Encoding
对象的形式返回源编码。
/*
 * Implements Encoding::Converter#source_encoding.
 *
 * Returns econv_get_encoding() applied to the converter's stored source
 * encoding.
 */
static VALUE
econv_source_encoding(VALUE self)
{
    return econv_get_encoding(check_econv(self)->source_encoding);
}