Skip to content

Commit

Permalink
zip: Simplify encoding config; add empty header matching
Browse files Browse the repository at this point in the history
  • Loading branch information
mholt committed Nov 19, 2024
1 parent 2932322 commit c1ebc84
Showing 1 changed file with 18 additions and 65 deletions.
83 changes: 18 additions & 65 deletions zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,12 @@ import (
"strings"

szip "github.com/STARRY-S/zip"
"golang.org/x/text/encoding"

"github.com/dsnet/compress/bzip2"
"github.com/klauspost/compress/zip"
"github.com/klauspost/compress/zstd"
"github.com/ulikunitz/xz"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)

func init() {
Expand Down Expand Up @@ -80,7 +74,7 @@ type Zip struct {
// For files in zip archives that do not have UTF-8
// encoded filenames and comments, specify the character
// encoding here.
TextEncoding string
TextEncoding encoding.Encoding
}

func (z Zip) Extension() string { return ".zip" }
Expand All @@ -94,11 +88,16 @@ func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchR
}

// match file header
buf, err := readAtMost(stream, len(zipHeader))
if err != nil {
return mr, err
for _, hdr := range zipHeaders {
buf, err := readAtMost(stream, len(hdr))
if err != nil {
return mr, err
}
if bytes.Equal(buf, hdr) {
mr.ByStream = true
break
}
}
mr.ByStream = bytes.Equal(buf, zipHeader)

return mr, nil
}
Expand Down Expand Up @@ -255,13 +254,14 @@ func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile Fi
// It is a no-op if the text is already UTF-8 encoded or if z.TextEncoding
// is not specified.
func (z Zip) decodeText(hdr *zip.FileHeader) {
if hdr.NonUTF8 && z.TextEncoding != "" {
filename, err := decodeText(hdr.Name, z.TextEncoding)
if hdr.NonUTF8 && z.TextEncoding != nil {
dec := z.TextEncoding.NewDecoder()
filename, err := dec.String(hdr.Name)
if err == nil {
hdr.Name = filename
}
if hdr.Comment != "" {
comment, err := decodeText(hdr.Comment, z.TextEncoding)
comment, err := dec.String(hdr.Comment)
if err == nil {
hdr.Comment = comment
}
Expand Down Expand Up @@ -384,58 +384,11 @@ var compressedFormats = map[string]struct{}{
".zipx": {},
}

var encodings = map[string]encoding.Encoding{
"ibm866": charmap.CodePage866,
"iso8859_2": charmap.ISO8859_2,
"iso8859_3": charmap.ISO8859_3,
"iso8859_4": charmap.ISO8859_4,
"iso8859_5": charmap.ISO8859_5,
"iso8859_6": charmap.ISO8859_6,
"iso8859_7": charmap.ISO8859_7,
"iso8859_8": charmap.ISO8859_8,
"iso8859_8I": charmap.ISO8859_8I,
"iso8859_10": charmap.ISO8859_10,
"iso8859_13": charmap.ISO8859_13,
"iso8859_14": charmap.ISO8859_14,
"iso8859_15": charmap.ISO8859_15,
"iso8859_16": charmap.ISO8859_16,
"koi8r": charmap.KOI8R,
"koi8u": charmap.KOI8U,
"macintosh": charmap.Macintosh,
"windows874": charmap.Windows874,
"windows1250": charmap.Windows1250,
"windows1251": charmap.Windows1251,
"windows1252": charmap.Windows1252,
"windows1253": charmap.Windows1253,
"windows1254": charmap.Windows1254,
"windows1255": charmap.Windows1255,
"windows1256": charmap.Windows1256,
"windows1257": charmap.Windows1257,
"windows1258": charmap.Windows1258,
"macintoshcyrillic": charmap.MacintoshCyrillic,
"gbk": simplifiedchinese.GBK,
"gb18030": simplifiedchinese.GB18030,
"big5": traditionalchinese.Big5,
"eucjp": japanese.EUCJP,
"iso2022jp": japanese.ISO2022JP,
"shiftjis": japanese.ShiftJIS,
"euckr": korean.EUCKR,
"utf16be": unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
"utf16le": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
var zipHeaders = [][]byte{
[]byte("PK\x03\x04"), // normal
[]byte("PK\x05\x06"), // empty
}

// decodeText returns UTF-8 encoded text from the given charset.
// Thanks to @zxdvd for contributing non-UTF-8 encoding logic in
// #149, and to @pashifika for helping in #305.
func decodeText(input, charset string) (string, error) {
if enc, ok := encodings[charset]; ok {
return enc.NewDecoder().String(input)
}
return "", fmt.Errorf("unrecognized charset %s", charset)
}

var zipHeader = []byte("PK\x03\x04") // NOTE: headers of empty zip files might end with 0x05,0x06 or 0x06,0x06 instead of 0x03,0x04

// Interface guards
var (
_ Archiver = Zip{}
Expand Down

0 comments on commit c1ebc84

Please sign in to comment.