diff --git a/go.mod b/go.mod index d3514b95..efc0025d 100644 --- a/go.mod +++ b/go.mod @@ -53,6 +53,7 @@ require ( golang.org/x/sys v0.25.0 google.golang.org/grpc v1.66.1 google.golang.org/protobuf v1.34.2 + pgregory.net/rapid v1.1.0 ) require ( diff --git a/go.sum b/go.sum index bfae941a..b2e6db8b 100644 --- a/go.sum +++ b/go.sum @@ -572,4 +572,6 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= mvdan.cc/gofumpt v0.4.0 h1:JVf4NN1mIpHogBj7ABpgOyZc65/UUOkKQFkoURsz4MM= mvdan.cc/gofumpt v0.4.0/go.mod h1:PljLOHDeZqgS8opHRKLzp2It2VBuSdteAgqUfzMTxlQ= +pgregory.net/rapid v1.1.0 h1:CMa0sjHSru3puNx+J0MIAuiiEV4N0qj8/cMWGBBCsjw= +pgregory.net/rapid v1.1.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/internal/languages/enry_vendored.go b/internal/languages/enry_vendored.go new file mode 100644 index 00000000..f85bb45f --- /dev/null +++ b/internal/languages/enry_vendored.go @@ -0,0 +1,15 @@ +package languages + +import "strings" + +// This file contains private functions +// vendored from the go-enry codebase. + +// convertToAliasKey is vendored from go-enry to make sure +// we're normalizing strings the same way. 
+func convertToAliasKey(langName string) string { + ak := strings.SplitN(langName, `,`, 2)[0] + ak = strings.Replace(ak, ` `, `_`, -1) + ak = strings.ToLower(ak) + return ak +} diff --git a/internal/languages/extensions.go b/internal/languages/extensions.go new file mode 100644 index 00000000..bd143a51 --- /dev/null +++ b/internal/languages/extensions.go @@ -0,0 +1,457 @@ +package languages + +import ( + "path/filepath" + "slices" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - Only this package can use enry +) + +// GetLanguageByNameOrAlias returns the standardized name for +// a language based on its name (in which case this is an identity operation) +// or based on its alias, which is potentially an alternate name for +// the language. +// +// The input is first normalized with convertToAliasKey. Aliases are fully lowercase, and map many-to-one to languages. +// +// For example, +// +// GetLanguageByNameOrAlias("ada") == "Ada", true +// GetLanguageByNameOrAlias("ada95") == "Ada", true +// +// Historical note: This function was added for replacing usages of +// enry.GetLanguageByAlias, which, unlike the name suggests, also +// handles non-normalized names such as those with spaces. +func GetLanguageByNameOrAlias(nameOrAlias string) (lang string, ok bool) { + alias := convertToAliasKey(nameOrAlias) + if lang, ok = unsupportedByEnryAliasMap[alias]; ok { + return lang, true + } + + return enry.GetLanguageByAlias(alias) +} + +// GetLanguageExtensions returns the list of file extensions for a given +// language. Returned extensions are always prefixed with a '.'. +// +// The returned slice is empty if the language is not known; a known language can also yield an empty slice when every extension enry reports for it is ignored via nicheExtensionUsages. +// +// Handles more languages than enry.GetLanguageExtensions. +// +// Mutually consistent with getLanguagesByExtension, see the tests +// for the exact invariants. 
+func GetLanguageExtensions(language string) []string { + if exts, ok := unsupportedByEnryNameToExtensionMap[language]; ok { + return slices.Clone(exts) // copy: never hand out the slice stored in the package-level map + } + + ignoreExts, isNiche := nicheExtensionUsages[language] + // Force a copy to avoid accidentally modifying the global variable + enryExts := slices.Clone(enry.GetLanguageExtensions(language)) + if !isNiche { + return enryExts // already a private copy; a second clone is not needed + } + return slices.DeleteFunc(enryExts, func(ext string) bool { + _, shouldIgnore := ignoreExts[ext] + return shouldIgnore + }) +} + +// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension +// to work around the following limitations: +// - For some extensions which are overwhelmingly used by a certain file type +// in practice, such as '.ts', '.md' and '.yaml', it returns ambiguous results. +// - It does not provide any information about binary files. +// - Some languages are not supported by enry yet (e.g. Magik) +func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFile bool) { + ext := filepath.Ext(path) + if ext == "" { + return nil, false + } + if lang, ok := unsupportedByEnryExtensionToNameMap[ext]; ok { + return []string{lang}, false + } + if _, ok := commonBinaryFileExtensions[ext[1:]]; ok { // the binary list stores extensions without the leading '.' + return nil, true + } + if lang, ok := overrideAmbiguousExtensionsMap[ext]; ok { + return []string{lang}, false + } + return enry.GetLanguagesByExtension(path, nil, nil), false +} + +var commonBinaryFileExtensions = func() map[string]struct{} { + m := make(map[string]struct{}, len(commonBinaryFileExtensionsList)) + for _, s := range commonBinaryFileExtensionsList { + m[s] = struct{}{} + } + return m +}() + +var overrideAmbiguousExtensionsMap = map[string]string{ + // Ignoring the uncommon usage of '.cs' for Smalltalk. + ".cs": "C#", + // The other languages are Filterscript, Forth, GLSL. Out of that, + // Forth and GLSL commonly use other extensions. Ignore Filterscript + // as it is niche. 
+ ".fs": "F#", + // Ignoring other variants of JSON, such as OASv2-json and OASv3-json + ".json": "JSON", + // Not considering "GCC Machine Description". + ".md": "Markdown", + // The other main language using '.rs' is RenderScript, but that's deprecated. + // See https://developer.android.com/guide/topics/renderscript/compute + ".rs": "Rust", + // In i18n contexts, there are XML files with '.ts' and '.tsx' extensions, + // but we ignore those for now to avoid penalizing the common case. + ".tsx": "TSX", + ".ts": "TypeScript", + // Ignoring "Adblock Filter List" and "Vim Help File". + ".txt": "Text", + // Ignoring other variants of YAML, such as MiniYAML, OASv2-yaml, OASv3-yaml. + ".yaml": "YAML", + ".yml": "YAML", + // The PR adding Pkl support also listed another language called Pickle in + // its heuristics, but doesn't have any real support for it. Just ignore + // it. + // https://github.com/github-linguist/linguist/pull/6730/files#diff-c2d2d7946540ab501a5ef7a7f54a57c530d8da599e41c2beb0fd2f5635d2fd50R539 + ".pkl": "Pkl", +} + +var unsupportedByEnryExtensionToNameMap = map[string]string{ + // Extensions for the Apex programming language + // See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm + ".apex": "Apex", + ".apxt": "Apex", + ".apxc": "Apex", + ".cls": "Apex", + ".trigger": "Apex", + ".magik": "Magik", // Magik language (not part of the Apex group above) +} + +// nicheExtensionUsages keeps track of which (lang, extension) mappings +// should not be considered. +// +// We cannot wholesale ignore these languages, as this list includes +// languages like XML, but it can contain unusual extensions like '.tsx' +// which we generally want to classify as TSX. 
+var nicheExtensionUsages = func() map[string]map[string]struct{} { + niche := map[string]map[string]struct{}{} + considered := map[string]struct{}{} // languages that our own maps assign an extension to + for _, lang := range overrideAmbiguousExtensionsMap { + considered[lang] = struct{}{} + } + for ext := range overrideAmbiguousExtensionsMap { + langs := enry.GetLanguagesByExtension("foo"+ext, nil, nil) // "foo" is only a placeholder base name + for _, lang := range langs { + if _, found := considered[lang]; !found { + if m, hasMap := niche[lang]; hasMap { + m[ext] = struct{}{} + } else { + niche[lang] = map[string]struct{}{ext: {}} + } + } + } + } + for specialOverrideExt, lang := range unsupportedByEnryExtensionToNameMap { + considered[lang] = struct{}{} // NOTE(review): added while iterating, so whether this language already counts as considered for another extension depends on map iteration order - confirm that is harmless + langs := enry.GetLanguagesByExtension("foo"+specialOverrideExt, nil, nil) + for _, lang := range langs { + if _, found := considered[lang]; !found { + if m, hasMap := niche[lang]; hasMap { + m[specialOverrideExt] = struct{}{} + } else { + niche[lang] = map[string]struct{}{specialOverrideExt: {}} + } + } + } + } + return niche +}() + +var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap) // language name -> its extensions + +// unsupportedByEnryAliasMap maps alias -> language name for languages +// not tracked by go-enry. +var unsupportedByEnryAliasMap = func() map[string]string { + out := map[string]string{} + for _, lang := range unsupportedByEnryExtensionToNameMap { + out[convertToAliasKey(lang)] = lang + } + return out +}() + +func reverseMap(m map[string]string) map[string][]string { // groups the keys of m by their value + n := make(map[string][]string, len(m)) + for k, v := range m { + n[v] = append(n[v], k) // slice order follows map iteration, so it is not deterministic + } + return n +} + +// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json +// License: https://github.com/sindresorhus/binary-extensions/blob/main/license +// Replace the contents with +// curl -L https://raw.githubusercontent.com/sindresorhus/binary-extensions/main/binary-extensions.json | jq '.[]' | awk '{print $1 ","}' +// +// Not adding a leading '.' 
here to make it easier to update/compare the list. +var commonBinaryFileExtensionsList = []string{ + "3dm", + "3ds", + "3g2", + "3gp", + "7z", + "a", + "aac", + "adp", + "ai", + "aif", + "aiff", + "alz", + "ape", + "apk", + "appimage", + "ar", + "arj", + "asf", + "au", + "avi", + "bak", + "baml", + "bh", + "bin", + "bk", + "bmp", + "btif", + "bz2", + "bzip2", + "cab", + "caf", + "cgm", + "class", + "cmx", + "cpio", + "cr2", + "cur", + "dat", + "dcm", + "deb", + "dex", + "djvu", + "dll", + "dmg", + "dng", + "doc", + "docm", + "docx", + "dot", + "dotm", + "dra", + "DS_Store", + "dsk", + "dts", + "dtshd", + "dvb", + "dwg", + "dxf", + "ecelp4800", + "ecelp7470", + "ecelp9600", + "egg", + "eol", + "eot", + "epub", + "exe", + "f4v", + "fbs", + "fh", + "fla", + "flac", + "flatpak", + "fli", + "flv", + "fpx", + "fst", + "fvt", + "g3", + "gh", + "gif", + "graffle", + "gz", + "gzip", + "h261", + "h263", + "h264", + "icns", + "ico", + "ief", + "img", + "ipa", + "iso", + "jar", + "jpeg", + "jpg", + "jpgv", + "jpm", + "jxr", + "key", + "ktx", + "lha", + "lib", + "lvp", + "lz", + "lzh", + "lzma", + "lzo", + "m3u", + "m4a", + "m4v", + "mar", + "mdi", + "mht", + "mid", + "midi", + "mj2", + "mka", + "mkv", + "mmr", + "mng", + "mobi", + "mov", + "movie", + "mp3", + "mp4", + "mp4a", + "mpeg", + "mpg", + "mpga", + "mxu", + "nef", + "npx", + "numbers", + "nupkg", + "o", + "odp", + "ods", + "odt", + "oga", + "ogg", + "ogv", + "otf", + "ott", + "pages", + "pbm", + "pcx", + "pdb", + "pdf", + "pea", + "pgm", + "pic", + "png", + "pnm", + "pot", + "potm", + "potx", + "ppa", + "ppam", + "ppm", + "pps", + "ppsm", + "ppsx", + "ppt", + "pptm", + "pptx", + "psd", + "pya", + "pyc", + "pyo", + "pyv", + "qt", + "rar", + "ras", + "raw", + "resources", + "rgb", + "rip", + "rlc", + "rmf", + "rmvb", + "rpm", + "rtf", + "rz", + "s3m", + "s7z", + "scpt", + "sgi", + "shar", + "snap", + "sil", + "sketch", + "slk", + "smv", + "snk", + "so", + "stl", + "suo", + "sub", + "swf", + "tar", + "tbz", + "tbz2", + 
+ "tga", + "tgz", + "thmx", + "tif", + "tiff", + "tlz", + "ttc", + "ttf", + "txz", + "udf", + "uvh", + "uvi", + "uvm", + "uvp", + "uvs", + "uvu", + "viv", + "vob", + "war", + "wav", + "wax", + "wbmp", + "wdp", + "weba", + "webm", + "webp", + "whl", + "wim", + "wm", + "wma", + "wmv", + "wmx", + "woff", + "woff2", + "wrm", + "wvx", + "xbm", + "xif", + "xla", + "xlam", + "xls", + "xlsb", + "xlsm", + "xlsx", + "xlt", + "xltm", + "xltx", + "xm", + "xmind", + "xpi", + "xpm", + "xwd", + "xz", + "z", + "zip", + "zipx", +} diff --git a/internal/languages/extensions_test.go b/internal/languages/extensions_test.go new file mode 100644 index 00000000..86fe3158 --- /dev/null +++ b/internal/languages/extensions_test.go @@ -0,0 +1,204 @@ +package languages + +import ( + "slices" + "strings" + "testing" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry + enrydata "github.com/go-enry/go-enry/v2/data" //nolint:depguard - This package is allowed to use enry + "github.com/stretchr/testify/require" +) + +// nonAmbiguousExtensionsCheck lists extension -> language pairs that must keep resolving to exactly that one language (regression guard). +var nonAmbiguousExtensionsCheck = map[string]string{ + ".apex": "Apex", + ".apxt": "Apex", + ".apxc": "Apex", + ".cls": "Apex", + ".trigger": "Apex", + ".js": "JavaScript", + // Linguist removed JSX (but not TSX) as a separate language: + // https://github.com/github-linguist/linguist/pull/5133 + ".jsx": "JavaScript", + ".ts": "TypeScript", + ".tsx": "TSX", + ".py": "Python", + ".rb": "Ruby", + ".go": "Go", + ".java": "Java", + ".kt": "Kotlin", + ".magik": "Magik", + ".scala": "Scala", + ".cs": "C#", + ".fs": "F#", + ".rs": "Rust", + ".c": "C", + ".cpp": "C++", + ".cxx": "C++", + ".hpp": "C++", + ".hxx": "C++", + ".lua": "Lua", + ".dart": "Dart", + ".swift": "Swift", + ".css": "CSS", + ".json": "JSON", + ".yml": "YAML", + ".xml": "XML", + ".pkl": "Pkl", +} + +func TestGetLanguageByAlias_UnsupportedLanguages(t *testing.T) { + for alias, name := range unsupportedByEnryAliasMap { + resName, _ := 
GetLanguageByNameOrAlias(alias) + require.Equal(t, name, resName, + "maybe a typo in `unsupportedByEnryAliasMap`?") + } +} + +func TestGetLanguageByAlias_NonAmbiguousLanguages(t *testing.T) { // every language in nonAmbiguousExtensionsCheck must be resolvable by its own name + for _, language := range nonAmbiguousExtensionsCheck { + _, ok := GetLanguageByNameOrAlias(language) + require.True(t, ok, + "unable to find language %s in go-enry", language) + } +} + +func TestGetLanguageExtensions_UnsupportedExtensions(t *testing.T) { // extensions we register ourselves must be reported back for their language + for language, exts := range unsupportedByEnryNameToExtensionMap { + extensions := GetLanguageExtensions(language) + for _, ext := range exts { + require.Contains(t, extensions, ext, + "maybe a typo in `unsupportedByEnryNameToExtensionMap`?") + } + } +} + +func TestGetLanguageExtensions_NonAmbiguousExtensions(t *testing.T) { // language -> extension direction of nonAmbiguousExtensionsCheck + langMap := reverseMap(nonAmbiguousExtensionsCheck) + for language, exts := range langMap { + extensions := GetLanguageExtensions(language) + for _, ext := range exts { + require.Contains(t, extensions, ext, + "If this test fails when updating enry, maybe `overrideAmbiguousExtensionsMap` needs updating") + } + } +} + +func TestGetLanguagesByExtension_UnsupportedExtensions(t *testing.T) { // extension -> language direction for the languages we add on top of enry + for ext, language := range unsupportedByEnryExtensionToNameMap { + filename := "foo" + ext + languages, _ := getLanguagesByExtension(filename) + require.Contains(t, languages, language, + "maybe a typo in `unsupportedByEnryExtensionToNameMap`?") + } +} + +func TestGetLanguagesByExtension_OverrideExtensions(t *testing.T) { // each override must be a mapping enry knows and must really be ambiguous in enry + for ext, language := range overrideAmbiguousExtensionsMap { + filename := "foo" + ext + enryLangs := enry.GetLanguagesByExtension(filename, nil, nil) + require.Contains(t, enryLangs, language, + "maybe a typo in `overrideAmbiguousExtensionsMap`?") + require.Greaterf(t, len(enryLangs), 1, + "extension %v is not ambiguous according to enry, remove it from `overrideAmbiguousExtensionsMap`", + ext) + } +} + +func TestGetLanguagesByExtension_NonAmbiguousExtensions(t *testing.T) { + for ext, language := 
range nonAmbiguousExtensionsCheck { + filename := "foo" + ext + languages, isLikelyBinaryFile := getLanguagesByExtension(filename) + require.False(t, isLikelyBinaryFile) + require.Equal(t, []string{language}, languages, + "If this test fails when updating enry, maybe `overrideAmbiguousExtensionsMap` needs updating") + } +} + +func TestGetLanguagesByExtension_BinaryExtensions(t *testing.T) { + for _, ext := range []string{".png", ".jpg", ".gif"} { + filename := "foo" + ext + _, isLikelyBinary := getLanguagesByExtension(filename) + require.Truef(t, isLikelyBinary, "filename: %v was not guessed to be binary; "+ + "bug in extension matching logic in getLanguagesByExtension maybe?", + filename) + } +} + +func TestExtensionsConsistency(t *testing.T) { + for ext, overrideLang := range overrideAmbiguousExtensionsMap { + filepath := "foo" + ext + enryLangsForExt := enry.GetLanguagesByExtension(filepath, nil, nil) + require.Containsf(t, enryLangsForExt, overrideLang, "overrideAmbiguousExtensionsMap maps extension %q to language %q but "+ + "that mapping is not present in enry's list %v", ext, overrideLang, enryLangsForExt) + require.Greaterf(t, len(enryLangsForExt), 1, "overrideAmbiguousExtensionsMap states that "+ + "%q extension is ambiguous, but only found langs: %v", ext, enryLangsForExt) + + candidates, isLikelyBinary := getLanguagesByExtension(filepath) + require.False(t, isLikelyBinary, "ambiguous files are all source code") + require.Len(t, candidates, 1, "getLanguagesByExtension should respect overrideAmbiguousExtensionsMap") + + shouldBeIgnoredLangsForExt := slices.DeleteFunc(slices.Clone(enryLangsForExt), func(s string) bool { // clone first: DeleteFunc works in place and the slice may be shared with enry's tables + return s == overrideLang + }) + for _, shouldBeIgnoredLang := range shouldBeIgnoredLangsForExt { + ignoredExts, found := nicheExtensionUsages[shouldBeIgnoredLang] + require.Truef(t, found, "expected lang: %q to have an entry in nicheExtensionUsages for consistency with GetLanguagesByExtension", shouldBeIgnoredLang) + require.Truef(t, 
len(ignoredExts) >= 1, "sets in nicheExtensionUsages must be non-empty") + + nonNicheExts := GetLanguageExtensions(shouldBeIgnoredLang) + for ignoredExt := range ignoredExts { + require.Falsef(t, slices.Contains(nonNicheExts, ignoredExt), + "GetLanguageExtensions should not return %q for lang %q for consistency with GetLanguagesByExtension", + ignoredExt, shouldBeIgnoredLang) + } + } + } +} + +func TestExtensionsConsistency2(t *testing.T) { // every extension GetLanguageExtensions reports must map back to its language + for lang := range enrydata.ExtensionsByLanguage { + for _, ext := range GetLanguageExtensions(lang) { + if strings.Count(ext, ".") > 1 { + // Ignore unusual edge cases like .coffee.md for Literate CoffeeScript + continue + } + langsByExt, isLikelyBinary := getLanguagesByExtension("foo" + ext) + if !isLikelyBinary { + require.Truef(t, slices.Contains(langsByExt, lang), + "expected getLanguagesByExtension result %v to contain %q (extension: %q)", langsByExt, lang, ext) + } + } + } +} + +func TestUnsupportedByEnry(t *testing.T) { // flags entries of our own maps that enry already covers with identical extensions + for lang := range unsupportedByEnryNameToExtensionMap { + enryExtensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryNameToExtensionMap", enryExtensions, lang) + } + } + for _, lang := range unsupportedByEnryAliasMap { + enryExtensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryAliasMap", enryExtensions, lang) + } + } + for _, lang := range unsupportedByEnryExtensionToNameMap { + enryExtensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryExtensionToNameMap", enryExtensions, lang) + } + } +} + +func validateLanguageAgainstGoEnry(t *testing.T, name string, enryExtensions []string, lang string) { + enryExtensions = slices.Clone(enryExtensions) + slices.Sort(enryExtensions) + sgExtensions := slices.Clone(unsupportedByEnryNameToExtensionMap[lang]) + slices.Sort(sgExtensions) + + require.NotEqualf(t, 
enryExtensions, sgExtensions, "looks like language %q is supported by enry with the same extensions; remove it from %q", lang, name) +} diff --git a/internal/languages/language.go b/internal/languages/language.go deleted file mode 100644 index ec76d945..00000000 --- a/internal/languages/language.go +++ /dev/null @@ -1,74 +0,0 @@ -// This file wraps the logic of go-enry (https://github.com/go-enry/go-enry) to support additional languages. -// go-enry is based off of a package called Linguist (https://github.com/github/linguist) -// and sometimes programming languages may not be supported by Linguist -// or may take a while to get merged in and make it into go-enry. This wrapper -// gives us flexibility to support languages in those cases. We list additional languages -// in this file and remove them once they make it into Linguist and go-enry. -// This logic is similar to what we have in the sourcegraph/sourcegraph repo, in the future -// we plan to refactor both into a common library to share between the two repos. 
-package languages - -import ( - "path/filepath" - "strings" - - "github.com/go-enry/go-enry/v2" -) - -var unsupportedByLinguistAliasMap = map[string]string{ - // Extensions for the Apex programming language - // See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm - "apex": "Apex", - // Pkl Configuration Language (https://pkl-lang.org/) - // Add to linguist on 6/7/24 - // can remove once go-enry package updates - // to that linguist version - "pkl": "Pkl", - // Magik Language - "magik": "Magik", -} - -var unsupportedByLinguistExtensionToNameMap = map[string]string{ - ".apex": "Apex", - ".apxt": "Apex", - ".apxc": "Apex", - ".cls": "Apex", - ".trigger": "Apex", - // Pkl Configuration Language (https://pkl-lang.org/) - ".pkl": "Pkl", - // Magik Language - ".magik": "Magik", -} - -// getLanguagesByAlias is a replacement for enry.GetLanguagesByAlias -// It supports languages that are missing in linguist -func GetLanguageByAlias(alias string) (language string, ok bool) { - language, ok = enry.GetLanguageByAlias(alias) - if !ok { - normalizedAlias := strings.ToLower(alias) - language, ok = unsupportedByLinguistAliasMap[normalizedAlias] - } - - return -} - -// GetLanguage is a replacement for enry.GetLanguage -// to find out the most probable language to return but includes support -// for languages missing from linguist -func GetLanguage(filename string, content []byte) (language string) { - language = enry.GetLanguage(filename, content) - - // If go-enry failed to find language, fall back on our - // internal check for languages missing in linguist - if language == "" { - ext := filepath.Ext(filename) - normalizedExt := strings.ToLower(ext) - if ext == "" { - return - } - if lang, ok := unsupportedByLinguistExtensionToNameMap[normalizedExt]; ok { - language = lang - } - } - return -} diff --git a/internal/languages/language_test.go b/internal/languages/language_test.go deleted file mode 100644 index 25e2382a..00000000 --- 
a/internal/languages/language_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package languages - -import "testing" - -func TestGetLanguageByAlias(t *testing.T) { - tests := []struct { - name string - alias string - want string - wantOk bool - }{ - { - name: "empty alias", - alias: "", - want: "", - wantOk: false, - }, - { - name: "unknown alias", - alias: "unknown", - want: "", - wantOk: false, - }, - { - name: "supported alias", - alias: "go", - want: "Go", - wantOk: true, - }, - { - name: "unsupported by linguist alias", - alias: "magik", - want: "Magik", - wantOk: true, - }, - { - name: "unsupported by linguist alias normalized", - alias: "mAgIk", - want: "Magik", - wantOk: true, - }, - { - name: "apex example unsupported by linguist alias", - alias: "apex", - want: "Apex", - wantOk: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, ok := GetLanguageByAlias(tt.alias) - if got != tt.want || ok != tt.wantOk { - t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tt.alias, got, ok, tt.want, tt.wantOk) - } - }) - } -} - -func TestGetLanguage(t *testing.T) { - tests := []struct { - name string - filename string - content []byte - want string - }{ - { - name: "empty filename", - filename: "", - content: []byte(""), - want: "", - }, - { - name: "unknown extension", - filename: "file.unknown", - content: []byte(""), - want: "", - }, - { - name: "supported extension", - filename: "file.go", - content: []byte("package main"), - want: "Go", - }, - { - name: "magik: unsupported by linguist extension", - filename: "file.magik", - content: []byte(""), - want: "Magik", - }, - { - name: "apex: unsupported by linguist extension", - filename: "file.apxc", - content: []byte(""), - want: "Apex", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := GetLanguage(tt.filename, tt.content) - if got != tt.want { - t.Errorf("GetLanguage(%q, %q) = %q, want %q", tt.filename, tt.content, got, tt.want) - } - }) - } -} 
diff --git a/internal/languages/languages.go b/internal/languages/languages.go new file mode 100644 index 00000000..73854ebc --- /dev/null +++ b/internal/languages/languages.go @@ -0,0 +1,134 @@ +package languages + +import ( + "slices" + "strings" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry. +) + +// enryLanguageMappings: keys must be lowercase, since NormalizeLanguage lowercases its input before the lookup +var enryLanguageMappings = map[string]string{ + "c#": "c_sharp", +} + +func NormalizeLanguage(filetype string) string { // lowercases filetype, then applies enryLanguageMappings (e.g. "c#" -> "c_sharp") + normalized := strings.ToLower(filetype) + if mapped, ok := enryLanguageMappings[normalized]; ok { + normalized = mapped + } + + return normalized +} + +// GetMostLikelyLanguage returns the language for the given path and contents. +// +// Prefer using GetLanguages instead of this function. +// +// TODO: Remove the extra normalization this function does over GetLanguages +func GetMostLikelyLanguage(path, contents string) (lang string, found bool) { + languages, _ := GetLanguages(path, func() ([]byte, error) { // error ignored: this callback never returns one + if len(contents) > 2048 { // only the first 2048 bytes are handed to language detection + return []byte(contents[:2048]), nil + } + return []byte(contents), nil + }) + for _, lang := range languages { + if lang != "" { + return NormalizeLanguage(lang), true + } + } + return "", false +} + +// GetLanguages is a replacement for enry.GetLanguages which +// avoids incorrect fallback behavior that is present in DefaultStrategies, +// where it will misclassify '.h' header files as C when file contents +// are not available. +// +// The content can be optionally passed via a callback instead of +// directly, so that in the common case, the caller can avoid fetching +// the content. The full content returned by getContent will be used for +// language detection. +// +// getContent is not called if the file is likely to be a binary file, +// as enry only covers programming languages. 
+// +// Returns: +// - An error if the getContent func returns an error +// - An empty slice if language detection failed or the file is likely binary +// - A single-element slice if the language was determined exactly +// - A multi-element slice if the language was ambiguous. For example, +// for simple `.h` files with just comments and macros, they may +// be valid C, C++ or any of their derivative languages (e.g. Objective-C). +func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) { + impl := func() ([]string, error) { + langs := enry.GetLanguagesByFilename(path, nil, nil) + if len(langs) == 1 { + return langs, nil + } + newLangs, isLikelyBinaryFile := getLanguagesByExtension(path) + if isLikelyBinaryFile { + return nil, nil + } + switch len(newLangs) { + case 0: + break // no extension-based candidates; keep the filename-based ones + case 1: + return newLangs, nil + default: + langs = newLangs + } + if getContent == nil { // no way to fetch content: return the path-based candidates + return langs, nil + } + content, err := getContent() + if err != nil { + return nil, err + } + if len(content) == 0 { + return langs, nil + } + if enry.IsBinary(content) { + return nil, nil + } + + // enry doesn't expose a way to call GetLanguages with a specific set of + // strategies, so just hand-roll that code here. + var languages = langs + for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, enry.GetLanguagesByContent, enry.GetLanguagesByClassifier} { + candidates := strategy(path, content, languages) + switch len(candidates) { + case 0: + continue + case 1: + return candidates, nil + default: + languages = candidates + } + } + + return languages, nil + } + + langs, err := impl() + return slices.Clone(langs), err // NOTE(review): presumably cloned so callers cannot mutate slices shared with enry - confirm +} + +// getLanguagesByShebang is a replacement for enry.GetLanguagesByShebang. +// +// The enry function considers non-programming languages such as 'Pod'/'Pod 6' +// also for shebangs, so work around that. 
+func getLanguagesByShebang(path string, content []byte, candidates []string) []string { + languages := enry.GetLanguagesByShebang(path, content, candidates) + if len(languages) == 2 { // both special cases below are exact two-element results + // See https://sourcegraph.com/github.com/go-enry/go-enry@40f2a1e5b90eec55c20441c2a5911dcfc298a447/-/blob/data/interpreter.go?L95-96 + if slices.Equal(languages, []string{"Perl", "Pod"}) { + return []string{"Perl"} + } + if slices.Equal(languages, []string{"Pod 6", "Raku"}) { + return []string{"Raku"} + } + } + return slices.Clone(languages) +} diff --git a/internal/languages/languages_test.go b/internal/languages/languages_test.go new file mode 100644 index 00000000..def950d6 --- /dev/null +++ b/internal/languages/languages_test.go @@ -0,0 +1,94 @@ +package languages + +import ( + "testing" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry + "github.com/stretchr/testify/require" + "pgregory.net/rapid" +) + +func TestGetLanguages(t *testing.T) { + const matlabContent = "function [out] = square(x)\nout = x * x;\nend" + const mathematicaContent = "f[x_] := x ^ 2\ng[y_] := f[y]" + const cppContent = "namespace x { }" + const cContent = "typedef struct { int x; } Int;" + const emptyContent = "" + + testCases := []struct { + path string + content string + expectedLanguages []string + compareFirstOnly bool // when set, only the first returned language is compared + }{ + {path: "perlscript", content: "#!/usr/bin/env perl\n$version = $ARGV[0];", expectedLanguages: []string{"Perl"}}, + {path: "rakuscript", content: "#!/usr/bin/env perl6\n$version = $ARGV[0];", expectedLanguages: []string{"Raku"}}, + {path: "ambiguous.h", content: emptyContent, expectedLanguages: []string{"C", "C++", "Objective-C"}}, + {path: "cpp.h", content: cppContent, expectedLanguages: []string{"C++"}}, + {path: "c.h", content: cContent, expectedLanguages: []string{"C"}}, + {path: "matlab.m", content: matlabContent, expectedLanguages: []string{"MATLAB"}, compareFirstOnly: true}, + {path: "mathematica.m", content: mathematicaContent, 
expectedLanguages: []string{"Mathematica"}, compareFirstOnly: true}, + { + path: "mathematica2.m", + content: ` +s := StringRiffle[{"a", "b", "c", "d", "e"}, ", "] +Flatten[{{a, b}, {c, {d}, e}, {f, {g, h}}}] +square[x_] := x ^ 2 +fourthpower[x_] := square[square[x]] +`, + expectedLanguages: []string{"Mathematica"}, + compareFirstOnly: true, + }, + } + + for _, testCase := range testCases { + var getContent func() ([]byte, error) + if testCase.content != "" { // empty content is modelled as a nil callback + getContent = func() ([]byte, error) { return []byte(testCase.content), nil } + } + gotLanguages, err := GetLanguages(testCase.path, getContent) + require.NoError(t, err) + if testCase.compareFirstOnly { + require.Equal(t, testCase.expectedLanguages, gotLanguages[0:1]) + continue + } + require.Equal(t, testCase.expectedLanguages, gotLanguages) + } + + rapid.Check(t, func(t *rapid.T) { // property: arbitrary path/content never panics, errors, or yields enry.OtherLanguage + path := rapid.String().Draw(t, "path") + content := rapid.SliceOfN(rapid.Byte(), 0, 100).Draw(t, "contents") + require.NotPanics(t, func() { + langs, err := GetLanguages(path, func() ([]byte, error) { return content, nil }) + require.NoError(t, err) + if len(langs) != 0 { + for _, l := range langs { + require.NotEqual(t, enry.OtherLanguage, l) + } + } + }) + }) + + rapid.Check(t, func(t *rapid.T) { // property: same guarantee over a fixed set of extensions crossed with the sample contents + baseName := "abcd" + exts := []string{".h", ".m", ".unknown", ""} + extGens := []*rapid.Generator[string]{} + for _, ext := range exts { + extGens = append(extGens, rapid.Just(ext)) + } + extension := rapid.OneOf(extGens...).Draw(t, "extension") + path := baseName + extension + contentGens := []*rapid.Generator[string]{} + for _, content := range []string{cContent, cppContent, mathematicaContent, matlabContent, emptyContent} { + contentGens = append(contentGens, rapid.Just(content)) + } + content := rapid.OneOf(contentGens...).Draw(t, "content") + langs, err := GetLanguages(path, func() ([]byte, error) { + return []byte(content), nil + }) + require.NoError(t, err) + for _, lang := range langs { + require.NotEqual(t, 
enry.OtherLanguage, lang) + } + }) +} diff --git a/query/parse.go b/query/parse.go index d8762f19..e5079bb3 100644 --- a/query/parse.go +++ b/query/parse.go @@ -172,7 +172,7 @@ func parseExpr(in []byte) (Q, int, error) { } expr = q case tokLang: - canonical, ok := languages.GetLanguageByAlias(text) + canonical, ok := languages.GetLanguageByNameOrAlias(text) if !ok { expr = &Const{false} } else {