package JEncode; #============================================================================== # JEncode - Wrapper to Encode providing some Jcode methods # # (c) Makama[at]donzoko.net (http://www.donzoko.net/) # # 2003-09-12 v.0.01 # 2003-09-13 v.0.02 add mime_encode,mime_decode,tr # 2003-11-18 v.0.03 add jfold (from Jcode.pm) # 2003-11-19 v.0.04 delete some meaningless codes # 2003-11-20 v.0.05 modify mime_decode() with utf8::encode() # 2003-12-06 v.1.00 modify tr() correctly, & change jlength's code # 2003-12-07 v.1.01 delete guess_encodeing's params # v.1.02 modify tr() to return $self # v.1.03 change jfold()'s code & add 'KINSOKU SYORI' # v.1.04 modify code of jlength(), jfold() and tr(). # 2003-12-18 v.1.05 add match() & s() which does not exist in Jcode.pm # 2003-12-21 v.1.06 modify Alais->{ascii} & add m() [alias to match()] # 2003-12-22 v.1.07 modify icode() # 2004-01-17 v.1.08 modify func_getcode() & const ambiguous for guessing code # 2004-01-18 v.1.09 modify _sjis_or_euc # 2004-01-18 v.1.10 add ucs2() # v.1.11 remake _sjis_or_euc() to _ambiguous() # 2004-01-20 v.1.12 modify s() for interpolating qr object # 2004-01-21 v.1.13 modify critical misstakes ( set(), append() ) # use Encode::Alias and rearrange constant Alias # 2004-01-22 v.1.14 split a code ref maker from s() for efficiency # 2004-01-23 v.1.15 remove eval block in _convert() for efficiency # 2004-01-24 v.1.16 remove meaningless codes from conversion methods # 2004-02-06 v.1.17 modify convert() # 2004-04-11 v.1.18 modify _ambiguous() and _test_mojicode() # 2004-04-12 v.1.19 modify _test_mojicode() for more exact result # 2004-04-14 v.1.20 modify tr(), s() and _test_mojicode() # v.1.21 modify _test_mojicode() for more exact result # 2004-04-15 v.1.22 modify _test_mojicode() for more exact result # 2004-04-16 v.1.23 modify _test_mojicode() for \p block # 2004-09-27 v.1.30 mime_encode() with iso-2022-jp *EXPERIMENTAL* # 2004-09-27 v.1.31 modify misstake of mime_encode() & mime_decode() # 2004-10-02 v.1.32 add is_utf8 check and $JEncode::UTF8 function # 2004-11-04 v.1.33 mime_encode() with iso-2022-jp *RELEASE* # 2004-11-07 v.1.34 modify E::M::H::ISO_2022_JP::_encode # delete Encode::FB_QUIET in _convert() # # * Note * # Internal storing charcters are 'euc-jp' same as Jcode.pm. # This has three functions - jcode(), getcodem(), and JEncdoe::convert. # jfold()'s code owes for Jcode.pm (by Dan Kogai) # # * TODO * the object will store string data as utf8 instead of euc-jp. # # Jcode.pm http://openlab.ring.gr.jp/Jcode/Jcode-0.83.tar.gz # http://search.cpan.org/~dankogai/Jcode-0.83/Jcode.pm # Jcode's information http://openlab.ring.gr.jp/Jcode/ # # Copyright : as Perl's license # #============================================================================== use strict; use Carp; use Encode; use Encode::Alias; use Encode::Guess; use Encode::JP::H2Z; use base qw(Exporter); our @EXPORT = qw(jcode getcode); use constant Alias => { 'shiftjis' => 'sjis', 'euc-jp' => 'euc', '7bit-jis' => 'jis', }; define_alias( euc => 'euc-jp', sjis => 'shiftjis', jis => '7bit-jis', ); our $VERSION = '1.34'; our $DEBUG = 0; our $UTF8 = 0; our $MIME_HEADER_ISO2022JP = 0; use overload ( q("") => sub { $UTF8 ? $_[0]->utf8 : $_[0]->euc }, q(==) => sub { overload::StrVal($_[0]) eq overload::StrVal($_[1]) }, q(=) => sub { $_[0]->set( $_[1] ) }, q(.=) => sub { $_[0]->append( $_[1] ) }, fallback => 1, ); ####################################### # Setting ####################################### sub new{ my $class = shift; my $self = {}; bless($self,$class); unless(defined $_[0]){ $_[0] = ''; } $self->_init(@_); } sub _init{ my $self = shift; my $str = $_[0]; my $r_str = (ref $str) ? $str : \$str; my $code = $_[1] if(defined $_[1]); utf8::encode(${$r_str}) if( Encode::is_utf8(${$r_str}) ); $self->{icode} = $code || _func_getcode($r_str) || 'euc'; convert($r_str,'euc',$self->{icode}); $self->{r_str} = $r_str; $self->{nmatch} = 0; $self; } sub set{ shift->_init(@_) } sub append{ my $self = shift; my $str = $_[0]; my $r_str = (ref $str) ? $str : \$str; my $code = $_[1] || _func_getcode($r_str) || 'euc'; _convert($r_str,$code,'euc'); ${ $self->{r_str} } .= ${$r_str}; return $self; } ####################################### # Instance Variables ####################################### sub r_str { $_[0]->{r_str}; } sub icode { $_[0]->{icode}; } sub nmatch{ $_[0]->{nmatch}; } ####################################### # Conversion ####################################### sub euc{ return ${$_[0]->{r_str}}; } sub jis{ return _get_converted($_[0]->{r_str},'7bit-jis'); } sub sjis{ return _get_converted($_[0]->{r_str},'shiftjis'); } sub iso_2022_jp{ return _get_converted($_[0]->{r_str},'iso-2022-jp'); } sub utf8{ return $UTF8 ? decode('euc-jp', ${$_[0]->{r_str}}) : _get_converted($_[0]->{r_str},'utf8'); } sub ucs2{ return _get_converted($_[0]->{r_str},'ucs2'); } ####################################### # Length, Translation and Fold ####################################### sub jlength{ length( decode('euc-jp', $_[0]->euc) ); } sub tr{ my $self = shift; my $str = decode('euc-jp', $self->euc); my $from = decode('euc-jp', $_[0]); my $to = decode('euc-jp', $_[1]); my $match = eval qq{ \$str =~ tr/$from/$to/ }; utf8::encode($str); convert($str,'euc','utf8'); ${$self->{r_str}} = $str; $self->{nmatch} = $match || 0; return $self; } sub jfold{ # Original code is from Dan Kogai's Jcode.pm & its subclass. my $self = shift; my $str = decode('euc-jp', $self->euc); my ($bpl,$nl,$kin) = @_; $bpl ||= 72; $nl ||= "\n"; my (@lines, $len, $i, $letter, %kinsoku); if( defined $kin and (ref $kin) eq 'ARRAY' ){ %kinsoku = map { ($_, 1) } @{$kin}; } while($str =~ m/(.)/sg){ $letter = $1; utf8::encode($letter); convert(\$letter,'euc','utf8'); if ($len + length($letter) > $bpl){ unless($kinsoku{$letter}){ $i++; $len = 0; } } $lines[$i] .= $letter; $len += length($letter); } $lines[$i] or pop @lines; $str = join($nl, @lines); $self->{r_str} = \$str; return wantarray ? @lines : $self; } ####################################### # MIME-Encoding ####################################### # 'MIME-Header-ISO_2022_JP' is defined in this file. sub mime_encode{ my $self = shift; my $utf8 = decode('euc-jp', $self->euc); my $type = $MIME_HEADER_ISO2022JP ? 'MIME-Header-ISO_2022_JP' : 'MIME-Header'; return Encode::encode($type, $utf8); } sub mime_decode{ my $self = shift; my $utf8 = Encode::decode('MIME-Header',${ $self->r_str }); utf8::encode($utf8); $self->set($utf8,'utf8'); } ####################################### # Full and Half ####################################### sub h2z{ _z_h($_[0]->{r_str},$_[0]->{icode},'z',$_[1]); return $_[0]; } sub z2h{ _z_h($_[0]->{r_str},$_[0]->{icode},'h'); return $_[0]; } ####################################### # Functions ####################################### sub jcode { return __PACKAGE__->new(@_); } sub getcode{ my $r_str = (ref $_[0]) ? $_[0] : \$_[0]; return _func_getcode($r_str); } sub convert{ # __PACKAGE__::convert($r_str,$ocode,$icode,$opt); my $r_str = (ref $_[0]) ? $_[0] : \$_[0]; my (undef,$ocode,$icode,$opt) = @_; utf8::encode($$r_str) if( Encode::is_utf8($$r_str) ); if(!defined $icode){ $icode = getcode($r_str) || return; } if(defined $opt){ _z_h($r_str,$icode,$opt); } _convert($r_str,$icode,$ocode); return ${$r_str}; } ####################################### # Matching ####################################### sub m{ match(@_); } sub match{ my $self = shift; my $str = decode('euc-jp', $self->euc); my $regex = $self->_make_regex($_[0],$_[1]); unless( wantarray ){ $str =~ /$regex/; } else{ my @array = $str =~ /$regex/; for( @array ){ utf8::encode($_); convert(\$_,'euc','utf8'); } return @array; } } sub s{ my $self = shift; my $str = decode('euc-jp', $self->euc); $self->_get_s(@_)->(\$str); utf8::encode($str); convert(\$str,'euc','utf8'); ${$self->{r_str}} = $str; $self; } sub _get_s{ my ($self,$pattern,$replace,$opt,$g) = @_; $opt ||= ''; $g ||= ''; if(defined $replace){ $pattern = $self->_make_regex($pattern,$opt); $replace = decode('euc-jp', $replace); $self->{_s} = eval qq{ sub { my \$r_str = shift; \${\$r_str} =~ s/\$pattern/$replace/$g; } }; croak $! if($@); } return $self->{_s} || croak "s code ref is not defined."; } sub _make_regex{ my $self = shift; return $self->{_regex} unless(defined $_[0]); my $pat = decode('euc-jp', $_[0]); my $opt = $_[1] || ''; my $regex = eval qq{ qr/\$pat/$opt }; croak $! if($@); $DEBUG and carp ">", "make regex: $regex"; return $self->{_regex} = $regex; } ####################################### # Internal function # r_str is ref & icode is for Encode ####################################### sub _z_h{ my ($r_str,$icode,$mode,$opt) = @_; if ($mode eq 'z'){ Encode::JP::H2Z::h2z($r_str,$opt); } elsif($mode eq 'h'){ Encode::JP::H2Z::z2h($r_str); } } sub _get_converted{ # return a converted string but not change original my ($r_str,$ocode) = @_; my $str = (ref $r_str) ? $$r_str : $r_str; _convert(\$str,'euc-jp',$ocode); return $str; } sub _convert{ # return the number of converted charactor my ($r_str,$icode,$ocode) = @_; return if($icode eq $ocode); Encode::from_to($$r_str,$icode,$ocode); } sub _func_getcode{ my $r_str = shift; my $enc = guess_encoding($$r_str, qw/euc-jp shiftjis 7bit-jis/) || ''; if(ref $enc){ return defined Alias->{$enc->name} ? Alias->{$enc->name} : $enc->name; } elsif($enc =~ / or /){ return _ambiguous($enc,$r_str); } else{ return undef; } } sub _ambiguous{ # This is so unsuitable that there is a need to remake. my $enc = shift; my %code = map{ ($_,1) } split/ or /,$enc; my ($euc,$sjis,$utf8) = (0,0,0); $euc = _test_mojicode($_[0],'euc-jp') if($code{'euc-jp'}); $sjis = _test_mojicode($_[0],'shiftjis') if($code{'shiftjis'}); $utf8 = _test_mojicode($_[0],'utf8') if($code{'utf8'}); $DEBUG and carp ">$enc: euc-$euc sjis-$sjis utf8-$utf8"; return ($utf8) ? 'utf8' : ($euc > $sjis) ? 'euc' : ($sjis > $euc) ? 'sjis' : ($euc) ? 'euc' : undef; } sub _test_mojicode{ my ($r_str,$code) = @_; my $str = decode($code, $$r_str) or die; my $nmatch = 0; $nmatch += ($code eq 'euc-jp') ? length($1) * 2 : length($1) while $str =~ m/((?: \p{InHiragana} | \p{InKatakana} | \p{InCJKUnifiedIdeographs} | \p{InHalfwidthAndFullwidthForms} | \p{InCJKSymbolsAndPunctuation} | \p{InArrows} | \p{InGeneralPunctuation} | \p{InGeometricShapes} | \p{InMiscellaneousSymbols} | \p{InMathematicalOperators} | \p{InGreekandCoptic} | \p{InCyrillic} | \p{InBoxDrawing} )+)/gx; return $nmatch; } ####################################### # # Encode::MIME::Header::ISO_2022_JP # ####################################### package Encode::MIME::Header::ISO_2022_JP; use strict; use base qw(Encode::MIME::Header); $Encode::Encoding{'MIME-Header-ISO_2022_JP'} = bless {encode => 'B', bpl => 75, Name => 'MIME-Header-ISO_2022_JP'} => __PACKAGE__; use constant HEAD => '=?ISO-2022-JP?'; use constant TAIL => '?='; use constant SINGLE => { B => \&_encode_b }; sub encode { my $self = shift; no strict 'refs'; local *Encode::MIME::Header::_encode = \&_encode; $self->SUPER::encode(@_); } # I wanted to rewrite only _encode_b(), but that function is called # from its coderef in constant SINGLE of superclass so that couldn't # be handled. So I had to copy and modify a source around _encode(). sub _encode{ # COPY of Encode::MIME::Header 2.0 my ($o, $str) = @_; my $enc = $o->{encode}; my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); # to coerce a floating-point arithmetics, the following contains # .0 in numbers -- dankogai $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0; my @result = (); # -- JEncode original my $i = 8; while($i++ <= length($str)){ use bytes (); if (bytes::length( Encode::encode('iso-2022-jp',substr($str, 0, $i)) ) > $llen){ push @result, SINGLE->{$enc}(substr($str, 0, $i-1,'')); $i = 8; } } $str and push @result, SINGLE->{$enc}(Encode::encode('iso-2022-jp',$str)); return @result; } sub _encode_b{ my $str = shift; utf8::encode($str) if( Encode::is_utf8($str) ); Encode::from_to($str, 'utf8', 'iso-2022-jp'); HEAD . 'B?' . MIME::Base64::encode_base64($str, '') . TAIL; } #============================================================================== 1; __END__ =pod =head1 NAME JEncode - JEncode - Wrapper to Encode providing some Jcode methods =head1 SYNOPSIS use JEncode; print JEncode->new($str)->utf8; JEncode::convert(\$str, $ocode, $icode, "z"); =head1 DESCRIPTION JEncode is Wrapper to Encode. This module provides a number of methods same as Jcode. =head1 Methods =over 4 =item $j = JEncode-Enew($str [, $icode]); See Jcode's document. =item $j-Eset($str [, $icode]); See Jcode's document. =item $j-Eappend($str [, $icode]); See Jcode's document. =item $j = jcode($str [, $icode]); See Jcode's document. =item $euc = $j-Eeuc; See Jcode's document. =item $jis = $j-Ejis; See Jcode's document. =item $sjis = $j-Esjis; See Jcode's document. =item $iso_2022_jp = $j-Eiso_2022_jp See Jcode's document. =item $length = $jcode-Ejlength(); See Jcode's document. =item $mime_header = $j-Emime_encode(); This method encode a header with UTF8. It doesn't take any arguments. After version 1.30 and more, if you set $JEncode::MIME_HEADER_ISO2022JP with true value, you can mime-encode with ISO-2022-JP. =item $j-Emime_decode; See Jcode's document. =item $j-Eh2z([$keep_dakuten]); See Jcode's document. =item $j-Ez2h; See Jcode's document. =item $j-Etr($from, $to); See Jcode's document. =item $j-Ejfold($bytes_per_line, $newline_str, $kinsoku_arrayref); All parameters are optional. First and second arguments are same as Jcode.pm. Third is for Kinsoku Syori. This is array ref of kinsoku words which contains EUC Japanese. =back =head1 Instance Variables =over 4 =item $j-Er_str See Jcode's document. =item $j-Eicode Current charactor code =item $j-Enmatch Number of matches (Only in $j->tr) =back =head1 Original Methods =over 4 =item $j-Ematch($pattern,[imsx]) This method is JEncode original. $pattern is EUC Japanese. An instance caches pattern. So If not $pattern, previous pattern is used. See m/// about its return value. =item $j-Es($pattern,$replacement,[imsx],[g]) This method is JEncode original. $pattern and $replacement are EUC Japanese. An instance caches pattern. So If not $pattern, previous pattern is used. print $j->set('foohogefoo')->s('foo','bar',undef,'g')->sjis; =back =head1 Subroutines =over 4 =item $code = getcode($str); not return nmatch =item Jcode::convert($str, [$ocode, $icode, $opt]); =back =head1 SEE ALSO Jcode, Encode, Encode::Guess =cut