From 78a7ea4769f8b6943f6fa96c484ebe2cca421a7f Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 17 Sep 2024 12:18:57 +0200 Subject: [PATCH] languages: copy over latest version from sourcegraph I realised we haven't been updating this package as we updated the package in the sourcegraph repo. We don't need all the functionality it has, but it's easier to just copy paste everything. Test Plan: go test --- go.mod | 1 + go.sum | 2 + internal/languages/enry_vendored.go | 15 + internal/languages/extensions.go | 457 ++++++++++++++++++++++++++ internal/languages/extensions_test.go | 204 ++++++++++++ internal/languages/language.go | 74 ----- internal/languages/language_test.go | 107 ------ internal/languages/languages.go | 134 ++++++++ internal/languages/languages_test.go | 94 ++++++ query/parse.go | 2 +- 10 files changed, 908 insertions(+), 182 deletions(-) create mode 100644 internal/languages/enry_vendored.go create mode 100644 internal/languages/extensions.go create mode 100644 internal/languages/extensions_test.go delete mode 100644 internal/languages/language.go delete mode 100644 internal/languages/language_test.go create mode 100644 internal/languages/languages.go create mode 100644 internal/languages/languages_test.go diff --git a/go.mod b/go.mod index d3514b95..efc0025d 100644 --- a/go.mod +++ b/go.mod @@ -53,6 +53,7 @@ require ( golang.org/x/sys v0.25.0 google.golang.org/grpc v1.66.1 google.golang.org/protobuf v1.34.2 + pgregory.net/rapid v1.1.0 ) require ( diff --git a/go.sum b/go.sum index bfae941a..b2e6db8b 100644 --- a/go.sum +++ b/go.sum @@ -572,4 +572,6 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= mvdan.cc/gofumpt v0.4.0 h1:JVf4NN1mIpHogBj7ABpgOyZc65/UUOkKQFkoURsz4MM= mvdan.cc/gofumpt v0.4.0/go.mod h1:PljLOHDeZqgS8opHRKLzp2It2VBuSdteAgqUfzMTxlQ= +pgregory.net/rapid v1.1.0 
h1:CMa0sjHSru3puNx+J0MIAuiiEV4N0qj8/cMWGBBCsjw= +pgregory.net/rapid v1.1.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/internal/languages/enry_vendored.go b/internal/languages/enry_vendored.go new file mode 100644 index 00000000..f85bb45f --- /dev/null +++ b/internal/languages/enry_vendored.go @@ -0,0 +1,15 @@ +package languages + +import "strings" + +// This file contains private functions +// vendored from the go-enry codebase. + +// convertToAliasKey is vendored from go-enry to make sure +// we're normalizing strings the same way. +func convertToAliasKey(langName string) string { + ak := strings.SplitN(langName, `,`, 2)[0] + ak = strings.Replace(ak, ` `, `_`, -1) + ak = strings.ToLower(ak) + return ak +} diff --git a/internal/languages/extensions.go b/internal/languages/extensions.go new file mode 100644 index 00000000..bd143a51 --- /dev/null +++ b/internal/languages/extensions.go @@ -0,0 +1,457 @@ +package languages + +import ( + "path/filepath" + "slices" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - Only this package can use enry +) + +// GetLanguageByNameOrAlias returns the standardized name for +// a language based on its name (in which case this is an identity operation) +// or based on its alias, which is potentially an alternate name for +// the language. +// +// Aliases are fully lowercase, and map N-1 to languages. +// +// For example, +// +// GetLanguageByNameOrAlias("ada") == "Ada", true +// GetLanguageByNameOrAlias("ada95") == "Ada", true +// +// Historical note: This function was added for replacing usages of +// enry.GetLanguageByAlias, which, unlike the name suggests, also +// handles non-normalized names such as those with spaces. 
+func GetLanguageByNameOrAlias(nameOrAlias string) (lang string, ok bool) { + alias := convertToAliasKey(nameOrAlias) + if lang, ok = unsupportedByEnryAliasMap[alias]; ok { + return lang, true + } + + return enry.GetLanguageByAlias(alias) +} + +// GetLanguageExtensions returns the list of file extensions for a given +// language. Returned extensions are always prefixed with a '.'. +// +// The returned slice is a fresh copy; it is empty iff the language is not known. +// +// Handles more languages than enry.GetLanguageExtensions. +// +// Mutually consistent with getLanguagesByExtension, see the tests +// for the exact invariants. +func GetLanguageExtensions(language string) []string { + if exts, ok := unsupportedByEnryNameToExtensionMap[language]; ok { + return slices.Clone(exts) + } + + ignoreExts, isNiche := nicheExtensionUsages[language] + // Force a copy to avoid accidentally modifying the global variable + enryExts := slices.Clone(enry.GetLanguageExtensions(language)) + if !isNiche { + return enryExts + } + return slices.DeleteFunc(enryExts, func(ext string) bool { + _, shouldIgnore := ignoreExts[ext] + return shouldIgnore + }) +} + +// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension +// to work around the following limitations: +// - For some extensions which are overwhelmingly used by a certain file type +// in practice, such as '.ts', '.md' and '.yaml', it returns ambiguous results. +// - It does not provide any information about binary files. +// - Some languages are not supported by enry yet (e.g. 
Magik) +func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFile bool) { + ext := filepath.Ext(path) + if ext == "" { + return nil, false + } + if lang, ok := unsupportedByEnryExtensionToNameMap[ext]; ok { + return []string{lang}, false + } + if _, ok := commonBinaryFileExtensions[ext[1:]]; ok { + return nil, true + } + if lang, ok := overrideAmbiguousExtensionsMap[ext]; ok { + return []string{lang}, false + } + return enry.GetLanguagesByExtension(path, nil, nil), false +} + +var commonBinaryFileExtensions = func() map[string]struct{} { + m := map[string]struct{}{} + for _, s := range commonBinaryFileExtensionsList { + m[s] = struct{}{} + } + return m +}() + +var overrideAmbiguousExtensionsMap = map[string]string{ + // Ignoring the uncommon usage of '.cs' for Smalltalk. + ".cs": "C#", + // The other languages are Filterscript, Forth, GLSL. Out of that, + // Forth and GLSL commonly use other extensions. Ignore Filterscript + // as it is niche. + ".fs": "F#", + // Ignoring other variants of JSON, such as OASv2-json and OASv3-json + ".json": "JSON", + // Not considering "GCC Machine Description". + ".md": "Markdown", + // The other main language using '.rs' is RenderScript, but that's deprecated. + // See https://developer.android.com/guide/topics/renderscript/compute + ".rs": "Rust", + // In i18n contexts, there are XML files with '.ts' and '.tsx' extensions, + // but we ignore those for now to avoid penalizing the common case. + ".tsx": "TSX", + ".ts": "TypeScript", + // Ignoring "Adblock Filter List" and "Vim Help File". + ".txt": "Text", + // Ignoring other variants of YAML, such as MiniYAML, OASv2-yaml, OASv3-yaml. + ".yaml": "YAML", + ".yml": "YAML", + // The PR adding Pkl support also listed another language called Pickle in + // its heuristics, but doesn't have any real support for it. Just ignore + // it. 
+ // https://github.com/github-linguist/linguist/pull/6730/files#diff-c2d2d7946540ab501a5ef7a7f54a57c530d8da599e41c2beb0fd2f5635d2fd50R539 + ".pkl": "Pkl", +} + +var unsupportedByEnryExtensionToNameMap = map[string]string{ + // Extensions for the Apex programming language + // See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm + ".apex": "Apex", + ".apxt": "Apex", + ".apxc": "Apex", + ".cls": "Apex", + ".trigger": "Apex", + ".magik": "Magik", +} + +// nicheExtensionUsages keeps track of which (lang, extension) mappings +// should not be considered. +// +// We cannot wholesale ignore these languages, as this list includes +// languages like XML, but it can contain unusual extensions like '.tsx' +// which we generally want to classify as TypeScript. +var nicheExtensionUsages = func() map[string]map[string]struct{} { + niche := map[string]map[string]struct{}{} + considered := map[string]struct{}{} + for _, lang := range overrideAmbiguousExtensionsMap { + considered[lang] = struct{}{} + } + for ext := range overrideAmbiguousExtensionsMap { + langs := enry.GetLanguagesByExtension("foo"+ext, nil, nil) + for _, lang := range langs { + if _, found := considered[lang]; !found { + if m, hasMap := niche[lang]; hasMap { + m[ext] = struct{}{} + } else { + niche[lang] = map[string]struct{}{ext: {}} + } + } + } + } + for specialOverrideExt, lang := range unsupportedByEnryExtensionToNameMap { + considered[lang] = struct{}{} + langs := enry.GetLanguagesByExtension("foo"+specialOverrideExt, nil, nil) + for _, lang := range langs { + if _, found := considered[lang]; !found { + if m, hasMap := niche[lang]; hasMap { + m[specialOverrideExt] = struct{}{} + } else { + niche[lang] = map[string]struct{}{specialOverrideExt: {}} + } + } + } + } + return niche +}() + +var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap) + +// unsupportedByEnryAliasMap maps alias -> language name for languages +// not tracked by 
go-enry. +var unsupportedByEnryAliasMap = func() map[string]string { + out := map[string]string{} + for _, lang := range unsupportedByEnryExtensionToNameMap { + out[convertToAliasKey(lang)] = lang + } + return out +}() + +func reverseMap(m map[string]string) map[string][]string { + n := make(map[string][]string, len(m)) + for k, v := range m { + n[v] = append(n[v], k) + } + return n +} + +// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json +// License: https://github.com/sindresorhus/binary-extensions/blob/main/license +// Replace the contents with +// curl -L https://raw.githubusercontent.com/sindresorhus/binary-extensions/main/binary-extensions.json | jq '.[]' | awk '{print $1 ","}' +// +// Not adding a leading '.' here to make it easier to update/compare the list. +var commonBinaryFileExtensionsList = []string{ + "3dm", + "3ds", + "3g2", + "3gp", + "7z", + "a", + "aac", + "adp", + "ai", + "aif", + "aiff", + "alz", + "ape", + "apk", + "appimage", + "ar", + "arj", + "asf", + "au", + "avi", + "bak", + "baml", + "bh", + "bin", + "bk", + "bmp", + "btif", + "bz2", + "bzip2", + "cab", + "caf", + "cgm", + "class", + "cmx", + "cpio", + "cr2", + "cur", + "dat", + "dcm", + "deb", + "dex", + "djvu", + "dll", + "dmg", + "dng", + "doc", + "docm", + "docx", + "dot", + "dotm", + "dra", + "DS_Store", + "dsk", + "dts", + "dtshd", + "dvb", + "dwg", + "dxf", + "ecelp4800", + "ecelp7470", + "ecelp9600", + "egg", + "eol", + "eot", + "epub", + "exe", + "f4v", + "fbs", + "fh", + "fla", + "flac", + "flatpak", + "fli", + "flv", + "fpx", + "fst", + "fvt", + "g3", + "gh", + "gif", + "graffle", + "gz", + "gzip", + "h261", + "h263", + "h264", + "icns", + "ico", + "ief", + "img", + "ipa", + "iso", + "jar", + "jpeg", + "jpg", + "jpgv", + "jpm", + "jxr", + "key", + "ktx", + "lha", + "lib", + "lvp", + "lz", + "lzh", + "lzma", + "lzo", + "m3u", + "m4a", + "m4v", + "mar", + "mdi", + "mht", + "mid", + "midi", + "mj2", + "mka", + "mkv", + "mmr", + "mng", + 
"mobi", + "mov", + "movie", + "mp3", + "mp4", + "mp4a", + "mpeg", + "mpg", + "mpga", + "mxu", + "nef", + "npx", + "numbers", + "nupkg", + "o", + "odp", + "ods", + "odt", + "oga", + "ogg", + "ogv", + "otf", + "ott", + "pages", + "pbm", + "pcx", + "pdb", + "pdf", + "pea", + "pgm", + "pic", + "png", + "pnm", + "pot", + "potm", + "potx", + "ppa", + "ppam", + "ppm", + "pps", + "ppsm", + "ppsx", + "ppt", + "pptm", + "pptx", + "psd", + "pya", + "pyc", + "pyo", + "pyv", + "qt", + "rar", + "ras", + "raw", + "resources", + "rgb", + "rip", + "rlc", + "rmf", + "rmvb", + "rpm", + "rtf", + "rz", + "s3m", + "s7z", + "scpt", + "sgi", + "shar", + "snap", + "sil", + "sketch", + "slk", + "smv", + "snk", + "so", + "stl", + "suo", + "sub", + "swf", + "tar", + "tbz", + "tbz2", + "tga", + "tgz", + "thmx", + "tif", + "tiff", + "tlz", + "ttc", + "ttf", + "txz", + "udf", + "uvh", + "uvi", + "uvm", + "uvp", + "uvs", + "uvu", + "viv", + "vob", + "war", + "wav", + "wax", + "wbmp", + "wdp", + "weba", + "webm", + "webp", + "whl", + "wim", + "wm", + "wma", + "wmv", + "wmx", + "woff", + "woff2", + "wrm", + "wvx", + "xbm", + "xif", + "xla", + "xlam", + "xls", + "xlsb", + "xlsm", + "xlsx", + "xlt", + "xltm", + "xltx", + "xm", + "xmind", + "xpi", + "xpm", + "xwd", + "xz", + "z", + "zip", + "zipx", +} diff --git a/internal/languages/extensions_test.go b/internal/languages/extensions_test.go new file mode 100644 index 00000000..86fe3158 --- /dev/null +++ b/internal/languages/extensions_test.go @@ -0,0 +1,204 @@ +package languages + +import ( + "slices" + "strings" + "testing" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry + enrydata "github.com/go-enry/go-enry/v2/data" //nolint:depguard - This package is allowed to use enry + "github.com/stretchr/testify/require" +) + +// Languages/extensions that we don't want to regress +var nonAmbiguousExtensionsCheck = map[string]string{ + ".apex": "Apex", + ".apxt": "Apex", + ".apxc": "Apex", + ".cls": "Apex", + 
".trigger": "Apex", + ".js": "JavaScript", + // Linguist removed JSX (but not TSX) as a separate language: + // https://github.com/github-linguist/linguist/pull/5133 + ".jsx": "JavaScript", + ".ts": "TypeScript", + ".tsx": "TSX", + ".py": "Python", + ".rb": "Ruby", + ".go": "Go", + ".java": "Java", + ".kt": "Kotlin", + ".magik": "Magik", + ".scala": "Scala", + ".cs": "C#", + ".fs": "F#", + ".rs": "Rust", + ".c": "C", + ".cpp": "C++", + ".cxx": "C++", + ".hpp": "C++", + ".hxx": "C++", + ".lua": "Lua", + ".dart": "Dart", + ".swift": "Swift", + ".css": "CSS", + ".json": "JSON", + ".yml": "YAML", + ".xml": "XML", + ".pkl": "Pkl", +} + +func TestGetLanguageByAlias_UnsupportedLanguages(t *testing.T) { + for alias, name := range unsupportedByEnryAliasMap { + resName, _ := GetLanguageByNameOrAlias(alias) + require.Equal(t, name, resName, + "maybe a typo in `unsupportedByEnryAliasMap`?") + } +} + +func TestGetLanguageByAlias_NonAmbiguousLanguages(t *testing.T) { + for _, language := range nonAmbiguousExtensionsCheck { + _, ok := GetLanguageByNameOrAlias(language) + require.True(t, ok, + "unable to find language %s in go-enry", language) + } +} + +func TestGetLanguageExtensions_UnsupportedExtensions(t *testing.T) { + for language, exts := range unsupportedByEnryNameToExtensionMap { + extensions := GetLanguageExtensions(language) + for _, ext := range exts { + require.Contains(t, extensions, ext, + "maybe a typo in `unsupportedByEnryNameToExtensionMap`?") + } + } +} + +func TestGetLanguageExtensions_NonAmbiguousExtensions(t *testing.T) { + langMap := reverseMap(nonAmbiguousExtensionsCheck) + for language, exts := range langMap { + extensions := GetLanguageExtensions(language) + for _, ext := range exts { + require.Contains(t, extensions, ext, + "If this test fails when updating enry, maybe `overrideAmbiguousExtensionsMap` needs updating") + } + } +} + +func TestGetLanguagesByExtension_UnsupportedExtensions(t *testing.T) { + for ext, language := range 
unsupportedByEnryExtensionToNameMap { + filename := "foo" + ext + languages, _ := getLanguagesByExtension(filename) + require.Contains(t, languages, language, + "maybe a typo in `unsupportedByEnryExtensionToNameMap`?") + } +} + +func TestGetLanguagesByExtension_OverrideExtensions(t *testing.T) { + for ext, language := range overrideAmbiguousExtensionsMap { + filename := "foo" + ext + enryLangs := enry.GetLanguagesByExtension(filename, nil, nil) + require.Contains(t, enryLangs, language, + "maybe a typo in `overrideAmbiguousExtensionsMap`?") + require.Greaterf(t, len(enryLangs), 1, + "extension %v is not ambiguous according to enry, remove it from `overrideAmbiguousExtensionsMap`", + ext) + } +} + +func TestGetLanguagesByExtension_NonAmbiguousExtensions(t *testing.T) { + for ext, language := range nonAmbiguousExtensionsCheck { + filename := "foo" + ext + languages, isLikelyBinaryFile := getLanguagesByExtension(filename) + require.False(t, isLikelyBinaryFile) + require.Equal(t, []string{language}, languages, + "If this test fails when updating enry, maybe `overrideAmbiguousExtensionsMap` needs updating") + } +} + +func TestGetLanguagesByExtension_BinaryExtensions(t *testing.T) { + for _, ext := range []string{".png", ".jpg", ".gif"} { + filename := "foo" + ext + _, isLikelyBinary := getLanguagesByExtension(filename) + require.Truef(t, isLikelyBinary, "filename: %v was not guessed to be binary;"+ + "bug in extension matching logic in getLanguagesByExtension maybe?", + filename) + } +} + +func TestExtensionsConsistency(t *testing.T) { + for ext, overrideLang := range overrideAmbiguousExtensionsMap { + filepath := "foo" + ext + enryLangsForExt := enry.GetLanguagesByExtension(filepath, nil, nil) + require.Containsf(t, enryLangsForExt, overrideLang, "overrideAmbiguousExtensionsMap maps extension %q to language %q but "+ + "that mapping is not present in enry's list %v", ext, overrideLang, enryLangsForExt) + require.Greaterf(t, len(enryLangsForExt), 1, 
"overrideAmbiguousExtensionsMap states that"+ + "%q extension is ambiguous, but only found langs: %v", ext, enryLangsForExt) + + candidates, isLikelyBinary := getLanguagesByExtension(filepath) + require.False(t, isLikelyBinary, "ambiguous files are all source code") + require.True(t, len(candidates) == 1, "getLanguagesByExtension should respect overrideAmbiguousExtensionsMap") + + shouldBeIgnoredLangsForExt := slices.DeleteFunc(enryLangsForExt, func(s string) bool { + return s == overrideLang + }) + for _, shouldBeIgnoredLang := range shouldBeIgnoredLangsForExt { + ignoredExts, found := nicheExtensionUsages[shouldBeIgnoredLang] + require.Truef(t, found, "expected lang: %q to have an entry in nicheExtensionUsages for consistency with GetLanguagesByExtension", shouldBeIgnoredLang) + require.Truef(t, len(ignoredExts) >= 1, "sets in nicheExtensionUsages must be non-empty") + + nonNicheExts := GetLanguageExtensions(shouldBeIgnoredLang) + for ignoredExt := range ignoredExts { + require.Falsef(t, slices.Contains(nonNicheExts, ignoredExt), + "GetLanguageExtensions should not return %q for lang %q for consistency with GetLanguagesByExtension", + ignoredExt, shouldBeIgnoredLang) + } + } + } +} + +func TestExtensionsConsistency2(t *testing.T) { + for lang := range enrydata.ExtensionsByLanguage { + for _, ext := range GetLanguageExtensions(lang) { + if strings.Count(ext, ".") > 1 { + // Ignore unusual edge cases like .coffee.md for Literate CoffeeScript + continue + } + langsByExt, isLikelyBinary := getLanguagesByExtension("foo" + ext) + if !isLikelyBinary { + require.Truef(t, slices.Contains(langsByExt, lang), + "expected getLanguagesByExtension result %v to contain %q (extension: %q)", langsByExt, lang, ext) + } + } + } +} + +func TestUnsupportedByEnry(t *testing.T) { + for lang := range unsupportedByEnryNameToExtensionMap { + enry_extensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryNameToExtensionMap", 
enry_extensions, lang) + } + } + for _, lang := range unsupportedByEnryAliasMap { + enry_extensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryAliasMap", enry_extensions, lang) + } + } + for _, lang := range unsupportedByEnryExtensionToNameMap { + enry_extensions, found := enrydata.ExtensionsByLanguage[lang] + if found { + validateLanguageAgainstGoEnry(t, "unsupportedByEnryExtensionToNameMap", enry_extensions, lang) + } + } +} + +func validateLanguageAgainstGoEnry(t *testing.T, name string, enryExtensions []string, lang string) { + enryExtensions = slices.Clone(enryExtensions) + slices.Sort(enryExtensions) + sgExtensions := slices.Clone(unsupportedByEnryNameToExtensionMap[lang]) + slices.Sort(sgExtensions) + + require.NotEqualf(t, enryExtensions, sgExtensions, "looks like language %q is supported by enry with the same extensions; remove it from %q", lang, name) +} diff --git a/internal/languages/language.go b/internal/languages/language.go deleted file mode 100644 index ec76d945..00000000 --- a/internal/languages/language.go +++ /dev/null @@ -1,74 +0,0 @@ -// This file wraps the logic of go-enry (https://github.com/go-enry/go-enry) to support additional languages. -// go-enry is based off of a package called Linguist (https://github.com/github/linguist) -// and sometimes programming languages may not be supported by Linguist -// or may take a while to get merged in and make it into go-enry. This wrapper -// gives us flexibility to support languages in those cases. We list additional languages -// in this file and remove them once they make it into Linguist and go-enry. -// This logic is similar to what we have in the sourcegraph/sourcegraph repo, in the future -// we plan to refactor both into a common library to share between the two repos. 
-package languages - -import ( - "path/filepath" - "strings" - - "github.com/go-enry/go-enry/v2" -) - -var unsupportedByLinguistAliasMap = map[string]string{ - // Extensions for the Apex programming language - // See https://developer.salesforce.com/docs/atlas.en-us.apexcode.meta/apexcode/apex_dev_guide.htm - "apex": "Apex", - // Pkl Configuration Language (https://pkl-lang.org/) - // Add to linguist on 6/7/24 - // can remove once go-enry package updates - // to that linguist version - "pkl": "Pkl", - // Magik Language - "magik": "Magik", -} - -var unsupportedByLinguistExtensionToNameMap = map[string]string{ - ".apex": "Apex", - ".apxt": "Apex", - ".apxc": "Apex", - ".cls": "Apex", - ".trigger": "Apex", - // Pkl Configuration Language (https://pkl-lang.org/) - ".pkl": "Pkl", - // Magik Language - ".magik": "Magik", -} - -// getLanguagesByAlias is a replacement for enry.GetLanguagesByAlias -// It supports languages that are missing in linguist -func GetLanguageByAlias(alias string) (language string, ok bool) { - language, ok = enry.GetLanguageByAlias(alias) - if !ok { - normalizedAlias := strings.ToLower(alias) - language, ok = unsupportedByLinguistAliasMap[normalizedAlias] - } - - return -} - -// GetLanguage is a replacement for enry.GetLanguage -// to find out the most probable language to return but includes support -// for languages missing from linguist -func GetLanguage(filename string, content []byte) (language string) { - language = enry.GetLanguage(filename, content) - - // If go-enry failed to find language, fall back on our - // internal check for languages missing in linguist - if language == "" { - ext := filepath.Ext(filename) - normalizedExt := strings.ToLower(ext) - if ext == "" { - return - } - if lang, ok := unsupportedByLinguistExtensionToNameMap[normalizedExt]; ok { - language = lang - } - } - return -} diff --git a/internal/languages/language_test.go b/internal/languages/language_test.go deleted file mode 100644 index 25e2382a..00000000 --- 
a/internal/languages/language_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package languages - -import "testing" - -func TestGetLanguageByAlias(t *testing.T) { - tests := []struct { - name string - alias string - want string - wantOk bool - }{ - { - name: "empty alias", - alias: "", - want: "", - wantOk: false, - }, - { - name: "unknown alias", - alias: "unknown", - want: "", - wantOk: false, - }, - { - name: "supported alias", - alias: "go", - want: "Go", - wantOk: true, - }, - { - name: "unsupported by linguist alias", - alias: "magik", - want: "Magik", - wantOk: true, - }, - { - name: "unsupported by linguist alias normalized", - alias: "mAgIk", - want: "Magik", - wantOk: true, - }, - { - name: "apex example unsupported by linguist alias", - alias: "apex", - want: "Apex", - wantOk: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, ok := GetLanguageByAlias(tt.alias) - if got != tt.want || ok != tt.wantOk { - t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tt.alias, got, ok, tt.want, tt.wantOk) - } - }) - } -} - -func TestGetLanguage(t *testing.T) { - tests := []struct { - name string - filename string - content []byte - want string - }{ - { - name: "empty filename", - filename: "", - content: []byte(""), - want: "", - }, - { - name: "unknown extension", - filename: "file.unknown", - content: []byte(""), - want: "", - }, - { - name: "supported extension", - filename: "file.go", - content: []byte("package main"), - want: "Go", - }, - { - name: "magik: unsupported by linguist extension", - filename: "file.magik", - content: []byte(""), - want: "Magik", - }, - { - name: "apex: unsupported by linguist extension", - filename: "file.apxc", - content: []byte(""), - want: "Apex", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := GetLanguage(tt.filename, tt.content) - if got != tt.want { - t.Errorf("GetLanguage(%q, %q) = %q, want %q", tt.filename, tt.content, got, tt.want) - } - }) - } -} 
diff --git a/internal/languages/languages.go b/internal/languages/languages.go new file mode 100644 index 00000000..73854ebc --- /dev/null +++ b/internal/languages/languages.go @@ -0,0 +1,134 @@ +package languages + +import ( + "slices" + "strings" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry. +) + +// Make sure all names are lowercase here, since they are normalized +var enryLanguageMappings = map[string]string{ + "c#": "c_sharp", +} + +func NormalizeLanguage(filetype string) string { + normalized := strings.ToLower(filetype) + if mapped, ok := enryLanguageMappings[normalized]; ok { + normalized = mapped + } + + return normalized +} + +// GetMostLikelyLanguage returns the language for the given path and contents. +// +// Prefer using GetLanguages instead of this function. +// +// TODO: Remove the extra normalization this function does over GetLanguages +func GetMostLikelyLanguage(path, contents string) (lang string, found bool) { + languages, _ := GetLanguages(path, func() ([]byte, error) { + if len(contents) > 2048 { + return []byte(contents[:2048]), nil + } + return []byte(contents), nil + }) + for _, lang := range languages { + if lang != "" { + return NormalizeLanguage(lang), true + } + } + return "", false +} + +// GetLanguages is a replacement for enry.GetLanguages which +// avoids incorrect fallback behavior that is present in DefaultStrategies, +// where it will misclassify '.h' header files as C when file contents +// are not available. +// +// The content can be optionally passed via a callback instead of +// directly, so that in the common case, the caller can avoid fetching +// the content. The full content returned by getContent will be used for +// language detection. +// +// getContent is not called if the file is likely to be a binary file, +// as enry only covers programming languages. 
+// +// Returns: +// - An error if the getContent func returns an error +// - An empty slice if language detection failed +// - A single-element slice if the language was determined exactly +// - A multi-element slice if the language was ambiguous. For example, +// for simple `.h` files with just comments and macros, they may +// be valid C, C++ or any of their derivative languages (e.g. Objective-C). +func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) { + impl := func() ([]string, error) { + langs := enry.GetLanguagesByFilename(path, nil, nil) + if len(langs) == 1 { + return langs, nil + } + newLangs, isLikelyBinaryFile := getLanguagesByExtension(path) + if isLikelyBinaryFile { + return nil, nil + } + switch len(newLangs) { + case 0: + break + case 1: + return newLangs, nil + default: + langs = newLangs + } + if getContent == nil { + return langs, nil + } + content, err := getContent() + if err != nil { + return nil, err + } + if len(content) == 0 { + return langs, nil + } + if enry.IsBinary(content) { + return nil, nil + } + + // enry doesn't expose a way to call GetLanguages with a specific set of + // strategies, so just hand-roll that code here. + var languages = langs + for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, enry.GetLanguagesByContent, enry.GetLanguagesByClassifier} { + candidates := strategy(path, content, languages) + switch len(candidates) { + case 0: + continue + case 1: + return candidates, nil + default: + languages = candidates + } + } + + return languages, nil + } + + langs, err := impl() + return slices.Clone(langs), err +} + +// getLanguagesByShebang is a replacement for enry.GetLanguagesByShebang. +// +// The enry function considers non-programming languages such as 'Pod'/'Pod 6' +// also for shebangs, so work around that. 
+func getLanguagesByShebang(path string, content []byte, candidates []string) []string { + languages := enry.GetLanguagesByShebang(path, content, candidates) + if len(languages) == 2 { + // See https://sourcegraph.com/github.com/go-enry/go-enry@40f2a1e5b90eec55c20441c2a5911dcfc298a447/-/blob/data/interpreter.go?L95-96 + if slices.Equal(languages, []string{"Perl", "Pod"}) { + return []string{"Perl"} + } + if slices.Equal(languages, []string{"Pod 6", "Raku"}) { + return []string{"Raku"} + } + } + return slices.Clone(languages) +} diff --git a/internal/languages/languages_test.go b/internal/languages/languages_test.go new file mode 100644 index 00000000..def950d6 --- /dev/null +++ b/internal/languages/languages_test.go @@ -0,0 +1,94 @@ +package languages + +import ( + "testing" + + "github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry + "github.com/stretchr/testify/require" + "pgregory.net/rapid" +) + +func TestGetLanguages(t *testing.T) { + const matlabContent = "function [out] = square(x)\nout = x * x;\nend" + const mathematicaContent = "f[x_] := x ^ 2\ng[y_] := f[y]" + const cppContent = "namespace x { }" + const cContent = "typedef struct { int x; } Int;" + const emptyContent = "" + + testCases := []struct { + path string + content string + expectedLanguages []string + compareFirstOnly bool + }{ + {path: "perlscript", content: "#!/usr/bin/env perl\n$version = $ARGV[0];", expectedLanguages: []string{"Perl"}}, + {path: "rakuscript", content: "#!/usr/bin/env perl6\n$version = $ARGV[0];", expectedLanguages: []string{"Raku"}}, + {path: "ambiguous.h", content: emptyContent, expectedLanguages: []string{"C", "C++", "Objective-C"}}, + {path: "cpp.h", content: cppContent, expectedLanguages: []string{"C++"}}, + {path: "c.h", content: cContent, expectedLanguages: []string{"C"}}, + {path: "matlab.m", content: matlabContent, expectedLanguages: []string{"MATLAB"}, compareFirstOnly: true}, + {path: "mathematica.m", content: mathematicaContent, 
expectedLanguages: []string{"Mathematica"}, compareFirstOnly: true}, + { + path: "mathematica2.m", + content: ` +s := StringRiffle[{"a", "b", "c", "d", "e"}, ", "] +Flatten[{{a, b}, {c, {d}, e}, {f, {g, h}}}] +square[x_] := x ^ 2 +fourthpower[x_] := square[square[x]] +`, + expectedLanguages: []string{"Mathematica"}, + compareFirstOnly: true, + }, + } + + for _, testCase := range testCases { + var getContent func() ([]byte, error) + if testCase.content != "" { + getContent = func() ([]byte, error) { return []byte(testCase.content), nil } + } + gotLanguages, err := GetLanguages(testCase.path, getContent) + require.NoError(t, err) + if testCase.compareFirstOnly { + require.Equal(t, testCase.expectedLanguages, gotLanguages[0:1]) + continue + } + require.Equal(t, testCase.expectedLanguages, gotLanguages) + } + + rapid.Check(t, func(t *rapid.T) { + path := rapid.String().Draw(t, "path") + content := rapid.SliceOfN(rapid.Byte(), 0, 100).Draw(t, "contents") + require.NotPanics(t, func() { + langs, err := GetLanguages(path, func() ([]byte, error) { return content, nil }) + require.NoError(t, err) + if len(langs) != 0 { + for _, l := range langs { + require.NotEqual(t, enry.OtherLanguage, l) + } + } + }) + }) + + rapid.Check(t, func(t *rapid.T) { + baseName := "abcd" + exts := []string{".h", ".m", ".unknown", ""} + extGens := []*rapid.Generator[string]{} + for _, ext := range exts { + extGens = append(extGens, rapid.Just(ext)) + } + extension := rapid.OneOf(extGens...).Draw(t, "extension") + path := baseName + extension + contentGens := []*rapid.Generator[string]{} + for _, content := range []string{cContent, cppContent, mathematicaContent, matlabContent, emptyContent} { + contentGens = append(contentGens, rapid.Just(content)) + } + content := rapid.OneOf(contentGens...).Draw(t, "content") + langs, err := GetLanguages(path, func() ([]byte, error) { + return []byte(content), nil + }) + require.NoError(t, err) + for _, lang := range langs { + require.NotEqual(t, 
enry.OtherLanguage, lang) + } + }) +} diff --git a/query/parse.go b/query/parse.go index d8762f19..e5079bb3 100644 --- a/query/parse.go +++ b/query/parse.go @@ -172,7 +172,7 @@ func parseExpr(in []byte) (Q, int, error) { } expr = q case tokLang: - canonical, ok := languages.GetLanguageByAlias(text) + canonical, ok := languages.GetLanguageByNameOrAlias(text) if !ok { expr = &Const{false} } else {