diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..cf55c19 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,222 @@ +name: build + +on: [push] + +jobs: + build: + name: ${{ matrix.config.name }} + runs-on: ${{ matrix.config.os }} + strategy: + fail-fast: false + matrix: + config: + - { + name: "Windows Latest MSVC", artifact: "Windows.zip", exe_path: "search_engine.exe", + os: windows-latest, + build_type: "Release", cc: "cl", cxx: "cl", + environment_script: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/Auxiliary/Build/vcvars64.bat" + } + - { + name: "Ubuntu Latest GCC", artifact: "Linux.zip", exe_path: "search_engine", + os: ubuntu-latest, + build_type: "Release", cc: "gcc", cxx: "g++" + } + - { + name: "macOS Latest Clang", artifact: "macOS.zip", exe_path: "search_engine", + os: macos-latest, + build_type: "Release", cc: "clang", cxx: "clang++" + } + + steps: + - uses: actions/checkout@v1 + + - name: Prepare for Conan + run: | + mkdir build + pip3 install conan + + - name: Conan + working-directory: build + run: | + conan install .. --build=missing + + - name: Configure + shell: cmake -P {0} + run: | + set(ENV{CC} ${{ matrix.config.cc }}) + set(ENV{CXX} ${{ matrix.config.cxx }}) + + if ("${{ runner.os }}" STREQUAL "Windows" AND NOT "x${{ matrix.config.environment_script }}" STREQUAL "x") + execute_process( + COMMAND "${{ matrix.config.environment_script }}" && set + OUTPUT_FILE environment_script_output.txt + ) + file(STRINGS environment_script_output.txt output_lines) + foreach(line IN LISTS output_lines) + if (line MATCHES "^([a-zA-Z0-9_-]+)=(.*)$") + set(ENV{${CMAKE_MATCH_1}} "${CMAKE_MATCH_2}") + endif() + endforeach() + endif() + + execute_process( + COMMAND cmake + -S . 
+ -B build + -D CMAKE_BUILD_TYPE=${{ matrix.config.build_type }} + -D ENABLE_TESTS=ON + RESULT_VARIABLE result + ) + + if (NOT result EQUAL 0) + message(FATAL_ERROR "Bad exit status") + endif() + + + - name: Build + shell: cmake -P {0} + run: | + + if ("${{ runner.os }}" STREQUAL "Windows" AND NOT "x${{ matrix.config.environment_script }}" STREQUAL "x") + file(STRINGS environment_script_output.txt output_lines) + foreach(line IN LISTS output_lines) + if (line MATCHES "^([a-zA-Z0-9_-]+)=(.*)$") + set(ENV{${CMAKE_MATCH_1}} "${CMAKE_MATCH_2}") + endif() + endforeach() + endif() + + execute_process( + COMMAND cmake --build build --config ${{ matrix.config.build_type }} + RESULT_VARIABLE result + ) + if (NOT result EQUAL 0) + message(FATAL_ERROR "Bad exit status") + endif() + + + - name: Run tests + working-directory: build + shell: cmake -P {0} + run: | + + if ("${{ runner.os }}" STREQUAL "Windows") + execute_process( + COMMAND bin/search_engine_test.exe + RESULT_VARIABLE result + ) + endif() + if (NOT "${{ runner.os }}" STREQUAL "Windows") + execute_process( + COMMAND ./bin/search_engine_test + RESULT_VARIABLE result + ) + endif() + + if (NOT result EQUAL 0) + message(FATAL_ERROR "Bad exit status ") + endif() + + - name: Pack + shell: cmake -P {0} + run: | + if ("${{ runner.os }}" STREQUAL "Windows") + execute_process( + COMMAND cmake -E tar "cfv" ${{ matrix.config.artifact }} --format=zip ${{ matrix.config.exe_path }} + WORKING_DIRECTORY ./build/bin + ) + execute_process( + COMMAND mv ./build/bin/${{ matrix.config.artifact }} ${{ matrix.config.artifact }} + ) + endif() + if (NOT "${{ runner.os }}" STREQUAL "Windows") + execute_process( + COMMAND cmake -E tar "cfv" ${{ matrix.config.artifact }} --format=zip ${{ matrix.config.exe_path }} + WORKING_DIRECTORY ./build/bin + ) + execute_process( + COMMAND mv ./build/bin/${{ matrix.config.artifact }} ./${{ matrix.config.artifact }} + ) + endif() + + - name: Upload + uses: actions/upload-artifact@v1 + with: + path: ./${{ matrix.config.artifact }} + name: ${{ matrix.config.artifact }} + + release: + if: contains(github.ref, 'tags/v') + runs-on: ubuntu-latest + needs: build + + steps: + - name: Create Release + id: create_release + uses: actions/create-release@v1.0.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Release ${{ github.ref }} + draft: false + prerelease: false + + - name: Store Release url + run: | + echo "${{ steps.create_release.outputs.upload_url }}" > ./upload_url + + - uses: actions/upload-artifact@v1 + with: + path: ./upload_url + name: upload_url + + publish: + if: contains(github.ref, 'tags/v') + name: ${{ matrix.config.name }} + runs-on: ${{ matrix.config.os }} + strategy: + fail-fast: false + matrix: + config: + - { + name: "Windows Latest MSVC", artifact: "Windows.zip", exe_path: search_engine.exe, + os: ubuntu-latest + } + - { + name: "Ubuntu Latest GCC", artifact: "Linux.zip", exe_path: search_engine, + os: ubuntu-latest + } + - { + name: "macOS Latest Clang", artifact: "macOS.zip", exe_path: search_engine, + os: ubuntu-latest + } + needs: release + + steps: + - name: Download artifact + uses: actions/download-artifact@v1 + with: + name: ${{ matrix.config.artifact }} + path: ./ + + - name: Download URL + uses: actions/download-artifact@v1 + with: + name: upload_url + path: ./ + - id: set_upload_url + run: | + upload_url=`cat ./upload_url` + echo ::set-output name=upload_url::$upload_url + + - name: Upload to Release + id: upload_to_release + uses: 
actions/upload-release-asset@v1.0.1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.set_upload_url.outputs.upload_url }} + asset_path: ./${{ matrix.config.artifact }} + asset_name: ${{ matrix.config.artifact }} + asset_content_type: application/x-gtar diff --git a/.github/workflows/sonar.yml b/.github/workflows/sonar.yml new file mode 100644 index 0000000..99828d1 --- /dev/null +++ b/.github/workflows/sonar.yml @@ -0,0 +1,61 @@ +name: SonarQube +on: + push: + branches: + - master + pull_request: + types: [opened, synchronize, reopened] +jobs: + build: + name: Build + runs-on: ubuntu-latest + env: + SONAR_SCANNER_VERSION: 4.7.0.2747 + SONAR_SERVER_URL: "https://sonarcloud.io" + BUILD_WRAPPER_OUT_DIR: build_wrapper_output_directory # Directory where build-wrapper output will be placed + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis + - name: Set up JDK 11 + uses: actions/setup-java@v1 + with: + java-version: 11 + - name: Download and set up sonar-scanner + env: + SONAR_SCANNER_DOWNLOAD_URL: https://binaries.sonarsource.com/Distribution/sonar-scanner-cli/sonar-scanner-cli-${{ env.SONAR_SCANNER_VERSION }}-linux.zip + run: | + mkdir -p $HOME/.sonar + curl -sSLo $HOME/.sonar/sonar-scanner.zip ${{ env.SONAR_SCANNER_DOWNLOAD_URL }} + unzip -o $HOME/.sonar/sonar-scanner.zip -d $HOME/.sonar/ + echo "$HOME/.sonar/sonar-scanner-${{ env.SONAR_SCANNER_VERSION }}-linux/bin" >> $GITHUB_PATH + - name: Download and set up build-wrapper + env: + BUILD_WRAPPER_DOWNLOAD_URL: ${{ env.SONAR_SERVER_URL }}/static/cpp/build-wrapper-linux-x86.zip + run: | + curl -sSLo $HOME/.sonar/build-wrapper-linux-x86.zip ${{ env.BUILD_WRAPPER_DOWNLOAD_URL }} + unzip -o $HOME/.sonar/build-wrapper-linux-x86.zip -d $HOME/.sonar/ + echo "$HOME/.sonar/build-wrapper-linux-x86" >> $GITHUB_PATH + - name: Run build-wrapper + run: | + mkdir build + cd build + pip install conan + conan install .. --build=missing + cd .. + cmake -DENABLE_TESTS=ON -DCMAKE_CXX_FLAGS=--coverage -S . 
-B build + build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} cmake --build build/ --config Debug + - name: Run tests + working-directory: build + run: | + ./bin/search_engine_test + - name: Collect coverage into one XML report + run: | + sudo apt-get install -y gcovr + gcovr --sonarqube > coverage.xml + - name: Run sonar-scanner + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + run: | + sonar-scanner --define sonar.host.url="${{ env.SONAR_SERVER_URL }}" --define sonar.cfamily.build-wrapper-output="${{ env.BUILD_WRAPPER_OUT_DIR }}" --define sonar.coverageReportPaths=coverage.xml diff --git a/.gitignore b/.gitignore index 4a25d57..bd5768d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,9 @@ /cmake-build-debug/ +/.idea/.gitignore +/.idea/misc.xml +/.idea/modules.xml +/.idea/search_engine.iml +/.idea/vcs.xml +#/resources/ +/cmake-build-debug-coverage/ +/.idea/* diff --git a/CMakeLists.txt b/CMakeLists.txt index fadb50d..cb00a11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,51 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.16) project(search_engine) include(FetchContent) +include(${CMAKE_CURRENT_BINARY_DIR}/conanbuildinfo.cmake) +conan_basic_setup() -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) +message("-- ENABLE_TESTS: " ${ENABLE_TESTS}) -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) +find_package(Threads REQUIRED) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(Boost_NO_WARN_NEW_VERSIONS 1) +set(Boost_USE_STATIC_LIBS ON) +find_package(Boost COMPONENTS thread system REQUIRED) +include_directories(${Boost_INCLUDE_DIR}) + +FetchContent_Declare( + json + URL + https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz +) FetchContent_MakeAvailable(json) -add_executable(search_engine main.cpp ConverterJSON.cpp ConverterJSON.h) +if(ENABLE_TESTS) + FetchContent_Declare( + googletest + URL + https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip + ) + FetchContent_MakeAvailable(googletest) + + include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) + set(gtest_disable_pthreads on) + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + enable_testing() + + add_executable(search_engine_test src/ConverterJSON.cpp tests/tests.cpp src/InvertedIndex.cpp src/SearchServer.cpp) + target_link_libraries(search_engine_test PRIVATE nlohmann_json::nlohmann_json gtest_main Threads::Threads ${Boost_LIBRARIES}) + + include(GoogleTest) + gtest_discover_tests(search_engine_test) + +endif() + +add_executable(search_engine src/main.cpp src/ConverterJSON.cpp src/InvertedIndex.cpp src/SearchServer.cpp ) +target_link_libraries(search_engine PRIVATE nlohmann_json::nlohmann_json Threads::Threads ${Boost_LIBRARIES}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/search_engine DESTINATION .) + -target_link_libraries(search_engine PRIVATE nlohmann_json::nlohmann_json) \ No newline at end of file diff --git a/ConverterJSON.cpp b/ConverterJSON.cpp deleted file mode 100644 index d5c5892..0000000 --- a/ConverterJSON.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// -// Created by ksv on 27.03.2022. 
-//
-
-#include "ConverterJSON.h"
-
-std::vector<std::string> ConverterJSON::GetTextDocuments() {
-    std::vector<std::string> response = std::vector<std::string>();
-
-    return response;
-}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..98c3250
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Kulaga Sergey
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..85980ee
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+[![CI](https://github.com/ksv87/search_engine/actions/workflows/build.yml/badge.svg)](https://github.com/ksv87/search_engine/actions/workflows/build.yml)
+[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=ksv87_search_engine&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=ksv87_search_engine)
+[![Coverage](https://sonarcloud.io/api/project_badges/measure?project=ksv87_search_engine&metric=coverage)](https://sonarcloud.io/summary/new_code?id=ksv87_search_engine)
+![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac%20os-deepgreen)
+![License](https://img.shields.io/badge/license-MIT-deepgreen)
+
+# search_engine
+
+This is a simple search engine for corporate use.
+
+# technology stack
+
+C++ 17 - language standard
+CMake - build system
+Conan - package manager
+Python 3 - required by Conan
+https://github.com/nlohmann/json - library for working with JSON
+https://github.com/google/googletest - library for testing
+https://www.boost.org/doc/libs/1_66_0/doc/html/boost_asio/reference/thread_pool.html - library for multithreading (thread pool)
+
+# build from source
+
+``` bash
+git clone https://github.com/ksv87/search_engine.git
+cd search_engine
+mkdir build
+cd build
+conan install .. --build=missing
+cmake ..
+cmake --build .
+```
+
+to enable testing
+``` bash
+cmake -DENABLE_TESTS=ON ..
+cmake --build .
+```
+
+to change the build type
+``` bash
+cmake -DCMAKE_BUILD_TYPE=Release ..
+cmake --build . --config Release
+```
+
+# configuration files
+
+./resources/config.json - main configuration file
+```
+field max_responses - maximum number of responses returned per request
+field files - list of files to index
+```
+./resources/requests.json - file with search requests
+./resources/answers.json - file with search results
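+
+These files are consumed through the ConverterJSON class added in this change. A minimal usage sketch, mirroring src/main.cpp:
+``` cpp
+#include "ConverterJSON.h"
+#include "InvertedIndex.h"
+#include "SearchServer.h"
+
+int main() {
+    ConverterJSON conv;
+    if (!conv.LoadConfig() || !conv.LoadRequests()) return 1; // reads config.json and requests.json
+
+    InvertedIndex index;
+    index.UpdateDocumentBase(conv.GetTextDocuments());  // index the files listed in config.json
+
+    SearchServer server(index);
+    auto results = server.search(conv.GetRequests());   // at most max_responses hits per request
+    conv.PutAnswers(results);                            // written to answers.json
+    return 0;
+}
+```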
+ +# run tests + +``` bash +./bin/search_engine_test +``` + +# run + +``` bash +./bin/search_engine +``` diff --git a/conanfile.txt b/conanfile.txt new file mode 100644 index 0000000..5482f84 --- /dev/null +++ b/conanfile.txt @@ -0,0 +1,5 @@ +[requires] +boost/1.79.0 + +[generators] +cmake diff --git a/ConverterJSON.h b/include/ConverterJSON.h similarity index 54% rename from ConverterJSON.h rename to include/ConverterJSON.h index cfb0f4e..dbce3f3 100644 --- a/ConverterJSON.h +++ b/include/ConverterJSON.h @@ -1,44 +1,73 @@ -#include -#include -#include - -#include "nlohmann/json.hpp" - -struct config_json { - struct { - std::string name; - std::string version; - int max_responses; - } config; - std::vector files; -}; - -/** -* Класс для работы с JSON-файлами -*/ -class ConverterJSON { -public: - ConverterJSON() = default; -/** -* Метод получения содержимого файлов -* @return Возвращает список с содержимым файлов перечисленных -* в config.json -*/ - std::vector GetTextDocuments(); -/** -* Метод считывает поле max_responses для определения предельного -* количества ответов на один запрос -* @return -*/ - int GetResponsesLimit(); -/** -* Метод получения запросов из файла requests.json -* @return возвращает список запросов из файла requests.json -*/ - std::vector GetRequests(); -/** -* Положить в файл answers.json результаты поисковых запросов -*/ - void putAnswers(std::vector>> - answers); +#include +#include +#include +#include + +#include "nlohmann/json.hpp" +#include "SearchServer.h" + +struct config_json { + struct { + std::string name; + std::string version; + int max_responses; + } config; + std::vector files; +}; + +struct request_json { + std::vector requests; +}; + +struct relevance_json { + int docid; + float rank; +}; + +/** +* Класс для работы с JSON-файлами +*/ +class ConverterJSON { +public: + config_json cf; + request_json req; + + ConverterJSON() = default; + +/** +* Метод читает конфигурационный файл +* @return Возвращает признак успешно ли прочитан конфиг +* из config.json +*/ + bool LoadConfig(); + +/** +* Метод читает файл с запросами для поиска +* @return Возвращает признак успешно ли прочитаны запросы +* из requests.json +*/ + bool LoadRequests(); +/** +* Метод получения содержимого файлов +* @return Возвращает список с содержимым файлов перечисленных +* в config.json +*/ + std::vector GetTextDocuments() const; +/** +* Метод считывает поле max_responses для определения предельного +* количества ответов на один запрос +* @return +*/ + int GetResponsesLimit() const; +/** +* Метод получения запросов из файла requests.json +* @return возвращает список запросов из файла requests.json +*/ + std::vector GetRequests() const; +/** +* Положить в файл answers.json результаты поисковых запросов +*/ + void PutAnswers(const std::vector>& + answers) const; + }; \ No newline at end of file diff --git a/include/InvertedIndex.h b/include/InvertedIndex.h new file mode 100644 index 0000000..fa6ff05 --- /dev/null +++ b/include/InvertedIndex.h @@ -0,0 +1,51 @@ +// +// Created by ksv on 31.03.2022. 
+// + +#ifndef SEARCH_ENGINE_INVERTEDINDEX_H +#define SEARCH_ENGINE_INVERTEDINDEX_H + + +#include +#include +#include +#include +#include +#include +#include +#include + +struct Entry { + size_t doc_id; + size_t count; +// Данный оператор необходим для проведения тестовых сценариев + bool operator ==(const Entry& other) const { + return (doc_id == other.doc_id && + count == other.count); + } +}; + +class InvertedIndex { + std::mutex* mutex; +public: + +/** +* Обновить или заполнить базу документов, по которой будем совершать +поиск +* @param texts_input содержимое документов +*/ + void UpdateDocumentBase(std::vector input_docs); +/** +* Метод определяет количество вхождений слова word в загруженной базе +документов +* @param word слово, частоту вхождений которого необходимо определить +* @return возвращает подготовленный список с частотой слов +*/ + std::vector GetWordCount(const std::string& word); +private: + std::vector docs; // список содержимого документов + mutable std::map> freq_dictionary; // частотный словарь + void RunThread(size_t docIndex, const std::string &doc); +}; + +#endif //SEARCH_ENGINE_INVERTEDINDEX_H diff --git a/include/SearchServer.h b/include/SearchServer.h new file mode 100644 index 0000000..f73aef9 --- /dev/null +++ b/include/SearchServer.h @@ -0,0 +1,41 @@ +// +// Created by ksv on 05.04.2022. +// + +#ifndef SEARCH_ENGINE_SEARCHSERVER_H +#define SEARCH_ENGINE_SEARCHSERVER_H + +#include "InvertedIndex.h" + +struct RelativeIndex{ + size_t doc_id; + float rank; + bool operator ==(const RelativeIndex& other) const { + return (doc_id == other.doc_id && rank == other.rank); + } + RelativeIndex(size_t doc_id, float rank): doc_id(doc_id), rank(rank){}; +}; + +class SearchServer { + public: + /** + * @param idx в конструктор класса передаётся ссылка на класс + InvertedIndex, + * чтобы SearchServer мог узнать частоту слов встречаемых в + запросе + */ + SearchServer(const InvertedIndex& idx) : _index(idx){ }; + /** + * Метод обработки поисковых запросов + * @param queries_input поисковые запросы взятые из файла + requests.json + * @return возвращает отсортированный список релевантных ответов для + заданных запросов + */ + std::vector> search(const std::vector& queries_input); + private: + InvertedIndex _index; + }; + + +#endif //SEARCH_ENGINE_SEARCHSERVER_H diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 7f1d32f..0000000 --- a/main.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include - -#include "ConverterJSON.h" - -int main() { - - auto config_file = std::ifstream("config.json"); - - if(!config_file.is_open()){ - std::cerr << "config file is missing." << std::endl; - return 1; - } - - nlohmann::json conf; - config_file >> conf; - - auto cf = config_json(); - - if(conf["config"].is_null()){ - std::cerr << "config file is empty." << std::endl; - return 1; - } - - if(!conf["config"]["name"].is_null()){ - cf.config.name = conf["config"]["name"]; - }else{ - cf.config.name = "Undefined"; - } - if(!conf["config"]["version"].is_null()) { - cf.config.version = conf["config"]["version"]; - } - cf.config.max_responses = conf["config"]["max_responses"]; - - for (auto f: conf["files"]) { - cf.files.push_back(f); - } - - std::cout << cf.config.name << ((cf.config.version == "")? 
"" : " version: ") << cf.config.version << std::endl; - -// for (auto f: cf.files) { -// std::cout << f << std::endl; -// } - - return 0; - -} diff --git a/resources/answers.json b/resources/answers.json new file mode 100644 index 0000000..03b7cef --- /dev/null +++ b/resources/answers.json @@ -0,0 +1 @@ +{"answers":{"request001":{"relevance":[{"docid":0,"rank":1.0},{"docid":1,"rank":0.3333333432674408},{"docid":2,"rank":0.3333333432674408}],"result":true},"request002":{"result":false},"request003":{"relevance":[{"docid":2,"rank":1.0}],"result":true}}} \ No newline at end of file diff --git a/resources/config.json b/resources/config.json new file mode 100644 index 0000000..9f397f4 --- /dev/null +++ b/resources/config.json @@ -0,0 +1,12 @@ +{ + "config": { + "name": "Kulaga Sergey SearchEngine", + "version": "0.1", + "max_responses": 5 + }, + "files": [ + "../resources/file001.txt", + "../resources/file002.txt", + "../resources/file003.txt" + ] +} \ No newline at end of file diff --git a/resources/file001.txt b/resources/file001.txt new file mode 100644 index 0000000..f9d5cd3 --- /dev/null +++ b/resources/file001.txt @@ -0,0 +1 @@ +best first file \ No newline at end of file diff --git a/resources/file002.txt b/resources/file002.txt new file mode 100644 index 0000000..0f8bbfe --- /dev/null +++ b/resources/file002.txt @@ -0,0 +1 @@ +second file \ No newline at end of file diff --git a/resources/file003.txt b/resources/file003.txt new file mode 100644 index 0000000..b64aa37 --- /dev/null +++ b/resources/file003.txt @@ -0,0 +1 @@ +third file \ No newline at end of file diff --git a/resources/requests.json b/resources/requests.json new file mode 100644 index 0000000..fb136f4 --- /dev/null +++ b/resources/requests.json @@ -0,0 +1,7 @@ +{ + "requests": [ + "first file best", + "what", + "third" + ] +} diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 0000000..d316f91 --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,11 @@ +sonar.projectKey=ksv87_search_engine +sonar.organization=ksv87 +sonar.projectVersion=0.2 + +# Path is relative to the sonar-project.properties file. Replace "\" by "/" on Windows. +sonar.sources=src,include +#sonar.tests=tests + +# Encoding of the source code. Default is default system encoding +sonar.sourceEncoding=UTF-8 + diff --git a/src/ConverterJSON.cpp b/src/ConverterJSON.cpp new file mode 100644 index 0000000..7811959 --- /dev/null +++ b/src/ConverterJSON.cpp @@ -0,0 +1,118 @@ +// +// Created by ksv on 27.03.2022. +// + +#include "ConverterJSON.h" + +bool ConverterJSON::LoadConfig(){ + auto config_file = std::ifstream("../resources/config.json"); + + if(!config_file.is_open()){ + std::cerr << "config file is missing." << std::endl; + return false; + } + + nlohmann::json conf; + config_file >> conf; + + if(conf["config"].is_null()){ + std::cerr << "config file is empty." 
<< std::endl; + return false; + } + + if(!conf["config"]["name"].is_null()){ + cf.config.name = conf["config"]["name"]; + }else{ + cf.config.name = "Undefined"; + } + + if(!conf["config"]["version"].is_null()) { + cf.config.version = conf["config"]["version"]; + }else{ + cf.config.version = "1.0"; + } + + if(!conf["config"]["max_responses"].is_null()){ + cf.config.max_responses = conf["config"]["max_responses"]; + }else{ + cf.config.max_responses = 5; + } + + for (const auto& f: conf["files"]) { + cf.files.push_back(f); + } + + return true; + +} + +std::vector ConverterJSON::GetTextDocuments() const { + auto response = std::vector(); + + for (const auto& f: cf.files) { + auto fl = std::ifstream(f); + std::stringstream ss; + ss << fl.rdbuf(); + response.push_back(ss.str()); + } + + return response; +} + +int ConverterJSON::GetResponsesLimit() const { + return cf.config.max_responses; +} + +bool ConverterJSON::LoadRequests() { + + auto requests_file = std::ifstream("../resources/requests.json"); + + if(!requests_file.is_open()){ + std::cerr << "requests file is missing." << std::endl; + return false; + } + + nlohmann::json requests; + requests_file >> requests; + + if(requests["requests"].is_null()){ + std::cerr << "requests file is empty." << std::endl; + return false; + } + + for (const auto& r: requests["requests"]) { + req.requests.push_back(r); + } + + return true; +} + +std::vector ConverterJSON::GetRequests() const { + return req.requests; +} + +void ConverterJSON::PutAnswers(const std::vector>& answers) const { + + nlohmann::json answ; + int ind = 1; + auto answers_file = std::ofstream("../resources/answers.json",std::ios::trunc); + for (const auto& a: answers) { + nlohmann::json request; + request["result"] = !a.empty(); + for (const auto& p: a) { + nlohmann::json dict_pair; + dict_pair["docid"] = p.doc_id; + dict_pair["rank"] = p.rank; + request["relevance"].push_back(dict_pair); + } + std::string name = "request00" + std::to_string(ind); + answ[name] = request; + ind++; + } + + nlohmann::json file; + file["answers"] = answ; + + answers_file << file; + +} diff --git a/src/InvertedIndex.cpp b/src/InvertedIndex.cpp new file mode 100644 index 0000000..a43c355 --- /dev/null +++ b/src/InvertedIndex.cpp @@ -0,0 +1,98 @@ +// +// Created by ksv on 31.03.2022. 
+// + +#include "InvertedIndex.h" + +#include +#include +#include "ConverterJSON.h" + +void InvertedIndex::RunThread(size_t docIndex, const std::string &doc) { + + bool isWord = false; + int i = 0; + while (doc[i] == ' ' && doc[i] != '\0'){ + i++; + } + int indexWordStart = 0; + int indexWordEnd = 0; + while (doc[i] != '\0') { + if (doc[i] != ' ' && !isWord) + { + isWord = true; + indexWordStart = i; + } + else if (doc[i] == ' ' || doc[i+1] == '\0'){ + isWord = false; + if(doc[i] == ' ') + indexWordEnd = i; + else + indexWordEnd = i+1; + std::string currentWord = doc.substr(indexWordStart, indexWordEnd-indexWordStart); + std::string result(currentWord.size(), ' '); + std::transform(currentWord.begin(), currentWord.end(), result.begin(), tolower); + currentWord = result; + bool entryFound = false; + mutex->lock(); + if(freq_dictionary.find(currentWord) != freq_dictionary.end()){ + auto ¤tEntry = freq_dictionary[currentWord]; + for (auto ¤t: currentEntry) { + if(current.doc_id == docIndex){ + current.count++; + entryFound = true; + } + } + if(!entryFound){ + auto entry = Entry(); + entry.doc_id = docIndex; + entry.count = 1; + currentEntry.push_back(entry); + } + }else{ + auto currentEntry = std::vector(); + auto entry = Entry(); + entry.doc_id = docIndex; + entry.count = 1; + currentEntry.push_back(entry); + freq_dictionary[currentWord] = currentEntry; + } + mutex->unlock(); + } + i++; + } +} + +void InvertedIndex::UpdateDocumentBase(std::vector input_docs) { + docs = std::move(input_docs); + size_t docIndex = 0; + std::vector threads; + this->mutex = new std::mutex; + auto thread_pool = boost::asio::thread_pool(); + for (const auto& doc: docs) { + boost::asio::post(thread_pool, boost::bind(&InvertedIndex::RunThread, this, docIndex, doc)); + docIndex++; + } + thread_pool.join(); + delete this->mutex; + for(auto& [key, value]: freq_dictionary){ + std::sort(value.begin(), value.end(), + [] (Entry const& a, Entry const& b) { return a.doc_id < b.doc_id; }); + } +} + +std::vector InvertedIndex::GetWordCount(const std::string& word) { + std::string result(word.size(), ' '); + std::transform(word.begin(), word.end(), result.begin(), ::tolower); + if(freq_dictionary.find(word) != freq_dictionary.end()){ + auto entry = freq_dictionary[word]; + return entry; + }else{ + auto entry = Entry(); + entry.doc_id = 0; + entry.count = 0; + auto resp = std::vector(); + resp.push_back(entry); + return resp; + } +} diff --git a/src/SearchServer.cpp b/src/SearchServer.cpp new file mode 100644 index 0000000..07b3edf --- /dev/null +++ b/src/SearchServer.cpp @@ -0,0 +1,117 @@ +// +// Created by ksv on 05.04.2022. 
+// + +#include +#include +#include +#include +#include +#include "SearchServer.h" +#include "ConverterJSON.h" + +std::vector> SearchServer::search(const std::vector &queries_input) { + + auto answers = std::vector>(); + + for (const auto& request: queries_input) { + + auto words = std::unordered_set(); + + bool isWord = false; + int i = 0; + while (request[i] == ' ' && request[i] != '\0'){ + i++; + } + int indexWordStart = 0; + int indexWordEnd = 0; + while (request[i] != '\0') { + if (request[i] != ' ' && !isWord) + { + isWord = true; + indexWordStart = i; + } + else if (request[i] == ' ' || request[i+1] == '\0'){ + isWord = false; + if(request[i] == ' ') + indexWordEnd = i; + else + indexWordEnd = i+1; + std::string currentWord = request.substr(indexWordStart, indexWordEnd-indexWordStart); + std::string result(currentWord.size(), ' '); + std::transform(currentWord.begin(), currentWord.end(), result.begin(), tolower); + currentWord = result; + words.insert(currentWord); + + } + i++; + } + + std::vector> wordsCount; + + for(const auto& word: words){ + auto freq_dictionary = this->_index.GetWordCount(word); + int sum = 0; + for (const auto& entry: freq_dictionary) { + sum += entry.count; + } + auto wordCount = std::make_pair(word,sum); + wordsCount.push_back(wordCount); + } + + std::sort(wordsCount.begin(), wordsCount.end(), + [] (std::pair const& a, std::pair const& b) { return a.second < b.second; }); + + auto answer = std::vector(); + + if(wordsCount[0].second != 0) { + + auto docs = std::map>>(); + + for (const auto &[key, value]: wordsCount) { + auto freq_dictionary = this->_index.GetWordCount(key); + for(const auto& entry: freq_dictionary){ + + if (docs.count(entry.doc_id) == 0) { + docs[entry.doc_id] = std::vector>(); + } + docs[entry.doc_id].push_back(std::make_pair(key, entry.count)); + + } + } + + float max = 0.0; + auto rel_map = std::map(); + for (const auto& [doc_key, doc_value]: docs) { + rel_map[doc_key] = 0; + for(const auto& [key, value]: doc_value){ + rel_map[doc_key]+= value; + } + if(rel_map[doc_key] > max) max = rel_map[doc_key]; + } + + for(const auto& [key, value]: rel_map){ + auto rel = RelativeIndex(key, (float)value / max); + answer.push_back(rel); + + std::sort(answer.begin(), answer.end(), + [] (RelativeIndex const& a, RelativeIndex const& b) { return a.rank > b.rank; }); + + auto conv = ConverterJSON(); + conv.LoadConfig(); + auto max_responses = conv.cf.config.max_responses; + + while(answer.size() > max_responses){ + answer.pop_back(); + } + + } + + } + + answers.push_back(answer); + + } + + return answers; +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..3451083 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,32 @@ +#include + +#include "ConverterJSON.h" +#include "InvertedIndex.h" +#include "SearchServer.h" + +int main() { + + auto converterJSON = ConverterJSON(); + auto invertedIndex = InvertedIndex(); + + if(!converterJSON.LoadConfig()){ + return 1; + } + + if(!converterJSON.LoadRequests()){ + return 1; + } + + std::cout << "==================================================\n"; + std::cout << converterJSON.cf.config.name << ((converterJSON.cf.config.version.empty())? 
"" : " version: ") << converterJSON.cf.config.version << std::endl; + std::cout << "==================================================\n"; + + auto texts = converterJSON.GetTextDocuments(); + invertedIndex.UpdateDocumentBase(texts); + SearchServer srv(invertedIndex); + auto result = srv.search(converterJSON.GetRequests()); + converterJSON.PutAnswers(result); + + return 0; + +} diff --git a/tests/tests.cpp b/tests/tests.cpp new file mode 100644 index 0000000..57917f5 --- /dev/null +++ b/tests/tests.cpp @@ -0,0 +1,216 @@ +// +// Created by ksv on 27.03.2022. +// + +#include "gtest/gtest.h" +#include "ConverterJSON.h" +#include "InvertedIndex.h" +#include "SearchServer.h" + +TEST(TestCaseConverterJSON, LoadConfig) +{ + auto converterJSON = ConverterJSON(); + EXPECT_EQ(converterJSON.LoadConfig(), true); +} + +TEST(TestCaseConverterJSON, GetTextDocuments) +{ + auto converterJSON = ConverterJSON(); + converterJSON.LoadConfig(); + auto texts = converterJSON.GetTextDocuments(); + EXPECT_EQ(texts.size(), 3); + EXPECT_EQ(texts[0],"best first file"); + EXPECT_EQ(texts[1],"second file"); + EXPECT_EQ(texts[2],"third file"); +} + +TEST(TestCaseConverterJSON, LoadRequests) +{ + auto converterJSON = ConverterJSON(); + EXPECT_EQ(converterJSON.LoadRequests(), true); + EXPECT_LE(converterJSON.req.requests.size(), 1000); +} + +TEST(TestCaseConverterJSON, GetRequests) +{ + auto converterJSON = ConverterJSON(); + converterJSON.LoadRequests(); + auto requests = converterJSON.GetRequests(); + EXPECT_EQ(requests.size(), 3); + EXPECT_EQ(requests[0],"first file best"); + EXPECT_EQ(requests[1],"what"); + EXPECT_EQ(requests[2],"third"); +} + +TEST(TestCaseConverterJSON, PutAnswer){ + std::string text = "{\"answers\":{\"request001\":{\"relevance\":[{\"docid\":0,\"rank\":0.8999999761581421},{\"docid\":1,\"rank\":0.800000011920929},{\"docid\":2,\"rank\":0.699999988079071}],\"result\":true},\"request002\":{\"relevance\":[{\"docid\":0,\"rank\":0.8999999761581421},{\"docid\":1,\"rank\":0.800000011920929},{\"docid\":2,\"rank\":0.699999988079071}],\"result\":true},\"request003\":{\"relevance\":[{\"docid\":0,\"rank\":0.8999999761581421},{\"docid\":1,\"rank\":0.800000011920929},{\"docid\":2,\"rank\":0.699999988079071}],\"result\":true}}}"; + auto converterJSON = ConverterJSON(); + converterJSON.LoadRequests(); + std::vector> answers; + for (const auto& r: converterJSON.req.requests) { + std::vector answer; + auto pair1 = RelativeIndex(0, 0.9); + answer.push_back(pair1); + auto pair2 = RelativeIndex(1, 0.8); + answer.push_back(pair2); + auto pair3 = RelativeIndex(2, 0.7); + answer.push_back(pair3); + answers.push_back(answer); + } + converterJSON.PutAnswers(answers); + auto file = std::ifstream("../resources/answers.json"); + std::string text_file; + file >> text_file; + EXPECT_EQ(text, text_file); +} + +using namespace std; + +void TestInvertedIndexFunctionality( + const vector& docs, + const vector& requests, + const std::vector>& expected +) { + std::vector> result; + InvertedIndex idx; + + idx.UpdateDocumentBase(docs); + + for(auto& request : requests) { + std::vector word_count = idx.GetWordCount(request); + result.push_back(word_count); + } + + ASSERT_EQ(result, expected); +} + +TEST(TestCaseInvertedIndex, TestBasic) { + const vector docs = { + "london is the capital of great britain", + "big ben is the nickname for the Great bell of the striking clock" + }; + const vector requests = {"london", "the"}; + const vector> expected = { + { + {0, 1} + }, { + {0, 1}, {1, 3} + } + }; + TestInvertedIndexFunctionality(docs, 
requests, expected); +} + +TEST(TestCaseInvertedIndex, TestBasic2) { + const vector docs = { + "milk milk milk milk water water water", + "milk water water", + "milk milk milk milk milk water water water water water", + "americano cappuccino" + }; + + const vector requests = {"milk", "water", "cappuccino"}; + const vector> expected = { + { + {0, 4}, {1, 1}, {2, 5} + }, { + {0, 3}, {1, 2}, {2, 5} + }, { + {3, 1} + } + }; + TestInvertedIndexFunctionality(docs, requests, expected); +} + +TEST(TestCaseInvertedIndex, TestInvertedIndexMissingWord) { + const vector docs = { + "a b c d e f g h i j k l", + "statement" + }; + const vector requests = {"m", "statement"}; + const vector> expected = { + { + {} + }, { + {1, 1} + } + }; + TestInvertedIndexFunctionality(docs, requests, expected); +} + +TEST(TestCaseSearchServer, TestSimple) { +const vector docs = { + "milk milk milk milk water water water", + "milk water water", + "milk milk milk milk milk water water water water water", + "Americano Cappuccino" +}; + +const vector request = {"milk water", "sugar"}; +const std::vector> expected = { + { + {2, 1}, + {0, 0.7}, + {1, 0.3} + }, + { + {} + } +}; + +InvertedIndex idx; +idx.UpdateDocumentBase(docs); + +SearchServer srv(idx); + +std::vector> result = srv.search(request); + +ASSERT_EQ(result, expected); +} + +TEST(TestCaseSearchServer, TestTop5) { +const vector docs = { + "london is the capital of great britain", + "paris is the capital of france", + "berlin is the capital of germany", + "rome is the capital of italy", + "madrid is the capital of spain", + "lisboa is the capital of portugal", + "bern is the capital of switzerland", + "moscow is the capital of russia", + "kiev is the capital of ukraine", + "minsk is the capital of belarus", + "astana is the capital of kazakhstan", + "beijing is the capital of china", + "tokyo is the capital of japan", + "bangkok is the capital of thailand", + "welcome to moscow the capital of russia the third rome", + "amsterdam is the capital of netherlands", + "helsinki is the capital of finland", + "oslo is the capital of norway", + "stockholm is the capital of sweden", + "riga is the capital of latvia", + "tallinn is the capital of estonia", + "warsaw is the capital of poland", +}; + +const vector request = {"moscow is the capital of russia"}; + +const std::vector> expected = { + { + {7, 1}, + {14, 1}, + {0, 0.666666687}, + {1, 0.666666687}, + {2, 0.666666687} + } +}; + +InvertedIndex idx; +idx.UpdateDocumentBase(docs); + +SearchServer srv(idx); +std::vector> result = srv.search(request); + +ASSERT_EQ(result, expected); +} +
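
The ranking that SearchServer::search produces, and that TestCaseSearchServer.TestSimple expects, can be checked by hand: a document's absolute relevance is the sum of the occurrence counts of every query word in it, and its rank is that sum divided by the largest sum among the matching documents. A short standalone sketch of that calculation for the TestSimple data (query "milk water"):

``` cpp
// Hand check of the relevance formula implemented in src/SearchServer.cpp,
// using the document set from TestCaseSearchServer.TestSimple.
#include <cstdio>

int main() {
    // absolute relevance of each matching document for the query "milk water"
    const int abs_rel[3] = {
        4 + 3,  // doc 0: "milk milk milk milk water water water"
        1 + 2,  // doc 1: "milk water water"
        5 + 5   // doc 2: "milk milk milk milk milk water water water water water"
    };
    const int max_rel = 10; // doc 2 scores highest

    for (int doc = 0; doc < 3; ++doc)
        std::printf("doc %d: rank = %.1f\n", doc, static_cast<float>(abs_rel[doc]) / max_rel);

    // prints 0.7, 0.3 and 1.0 - exactly the RelativeIndex values the test expects;
    // the second query, "sugar", matches no document, so its answer list stays empty
    return 0;
}
```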