From f1309a1be3824fd12a528e4aed4de12a17ca49b3 Mon Sep 17 00:00:00 2001 From: David Belicza <87.bdavid@gmail.com> Date: Sun, 20 Sep 2020 16:55:12 +0200 Subject: [PATCH 1/5] Prepare the package for refactoring --- .github/workflows/tests.yml | 39 + .gitignore | 4 +- .travis.yml | 11 - LICENSE | 21 + composer.json | 15 +- composer.lock | 1409 ----------------- phpunit.xml | 27 - readme.md | 105 +- src/TextRankFacade.php | 7 - src/Tool/Graph.php | 15 - src/Tool/Parser.php | 14 - src/Tool/Score.php | 14 - src/Tool/StopWords/English.php | 12 - src/Tool/StopWords/French.php | 12 - src/Tool/StopWords/German.php | 15 +- src/Tool/StopWords/Italian.php | 5 - src/Tool/StopWords/Norwegian.php | 12 - src/Tool/StopWords/Russian.php | 13 - src/Tool/StopWords/Spanish.php | 12 - src/Tool/StopWords/StopWordsAbstract.php | 12 - src/Tool/Summarize.php | 14 - src/Tool/Text.php | 14 - tests/{ => functional}/TextRankFacadeTest.php | 11 +- tests/phpunit.xml | 34 + {res => tests/resource}/sample1.txt | 0 25 files changed, 144 insertions(+), 1703 deletions(-) create mode 100644 .github/workflows/tests.yml delete mode 100644 .travis.yml create mode 100644 LICENSE delete mode 100644 composer.lock delete mode 100644 phpunit.xml rename tests/{ => functional}/TextRankFacadeTest.php (93%) create mode 100644 tests/phpunit.xml rename {res => tests/resource}/sample1.txt (100%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..f3f4b41 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,39 @@ +name: tests + +on: + push: + pull_request: + +jobs: + run: + runs-on: ${{ matrix.operating-system }} + strategy: + matrix: + operating-system: [ubuntu-latest] + php-versions: ['7.4'] + name: PHP ${{ matrix.php-versions }} Test on ${{ matrix.operating-system }} + steps: + - name: Checkout + uses: actions/checkout@v2 + + - name: Setup PHP + uses: shivammathur/setup-php@v2 + with: + php-version: ${{ matrix.php-versions }} + coverage: xdebug + + - name: Validate composer files + run: composer validate + + - name: Install dependencies + if: steps.composer-cache.outputs.cache-hit != 'true' + run: composer install --prefer-dist --no-progress --no-suggest + + - name: Run test suite + run: composer test + + - name: Publish Analysis + uses: codecov/codecov-action@v1.0.13 + with: + name: Code Analysis + directory: ./var/code-coverage/clover/coverage.xml diff --git a/.gitignore b/.gitignore index ab27d1e..fa374aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /.idea -/vendor \ No newline at end of file +/composer.lock +/vendor +/var diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0895ccb..0000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: php -php: - - '7.1' - - '7.2' - -dist: trusty -sudo: required -group: edge - -before_script: - - composer install \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..29afed1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 PHP-Science + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/composer.json b/composer.json index 4d07d2e..3dc3015 100644 --- a/composer.json +++ b/composer.json @@ -10,19 +10,24 @@ } ], "require": { - "php": "7.*", + "php": "7.4.*", "ext-ctype": "*", - "ext-mbstring": "*" + "ext-mbstring": "*", + "php-science/pagerank": "1.*" }, "require-dev": { - "phpunit/phpunit": "^5.4" + "phpunit/phpunit": "^9" }, "autoload": { "psr-4": { - "PhpScience\\TextRank\\": ["src/", "tests/"] + "PhpScience\\TextRank\\": [ + "src/", + "tests/unit/", + "tests/functional/" + ] } }, "scripts": { - "test": "phpunit --colors='always' $(pwd)/tests" + "test": "vendor/bin/phpunit -c $(pwd)/tests/phpunit.xml --colors='always' --do-not-cache-result" } } diff --git a/composer.lock b/composer.lock deleted file mode 100644 index 65ae386..0000000 --- a/composer.lock +++ /dev/null @@ -1,1409 +0,0 @@ -{ - "_readme": [ - "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", - "This file is @generated automatically" - ], - "content-hash": "97ff0c108502ab602ba79bf3a251e208", - "packages": [], - "packages-dev": [ - { - "name": "doctrine/instantiator", - "version": "1.1.0", - "source": { - "type": "git", - "url": "https://github.com/doctrine/instantiator.git", - "reference": "185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/doctrine/instantiator/zipball/185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda", - "reference": "185b8868aa9bf7159f5f953ed5afb2d7fcdc3bda", - "shasum": "" - }, - "require": { - "php": "^7.1" - }, - "require-dev": { - "athletic/athletic": "~0.1.8", - "ext-pdo": "*", - "ext-phar": "*", - "phpunit/phpunit": "^6.2.3", - "squizlabs/php_codesniffer": "^3.0.2" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.2.x-dev" - } - }, - "autoload": { - "psr-4": { - "Doctrine\\Instantiator\\": "src/Doctrine/Instantiator/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Marco Pivetta", - "email": "ocramius@gmail.com", - "homepage": "http://ocramius.github.com/" - } - ], - "description": "A small, lightweight utility to instantiate objects in PHP without invoking their constructors", - "homepage": "https://github.com/doctrine/instantiator", - "keywords": [ - "constructor", - "instantiate" - ], - "time": "2017-07-22T11:58:36+00:00" - }, - { - "name": "myclabs/deep-copy", - "version": "1.8.1", - "source": { - "type": "git", - "url": "https://github.com/myclabs/DeepCopy.git", - "reference": "3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8", - "reference": "3e01bdad3e18354c3dce54466b7fbe33a9f9f7f8", - "shasum": "" - }, - "require": { - "php": "^7.1" - }, - "replace": { - "myclabs/deep-copy": "self.version" - }, - "require-dev": { - "doctrine/collections": "^1.0", - "doctrine/common": "^2.6", - "phpunit/phpunit": "^7.1" - }, - "type": "library", - "autoload": { - "psr-4": { - "DeepCopy\\": "src/DeepCopy/" - }, - "files": [ - "src/DeepCopy/deep_copy.php" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "description": "Create deep copies (clones) of your objects", - "keywords": [ - "clone", - "copy", - "duplicate", - "object", - "object graph" - ], - "time": "2018-06-11T23:09:50+00:00" - }, - { - "name": "phpdocumentor/reflection-common", - "version": "1.0.1", - "source": { - "type": "git", - "url": "https://github.com/phpDocumentor/ReflectionCommon.git", - "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/phpDocumentor/ReflectionCommon/zipball/21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", - "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", - "shasum": "" - }, - "require": { - "php": ">=5.5" - }, - "require-dev": { - "phpunit/phpunit": "^4.6" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0.x-dev" - } - }, - "autoload": { - "psr-4": { - "phpDocumentor\\Reflection\\": [ - "src" - ] - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Jaap van Otterdijk", - "email": "opensource@ijaap.nl" - } - ], - "description": "Common reflection classes used by phpdocumentor to reflect the code structure", - "homepage": "http://www.phpdoc.org", - "keywords": [ - "FQSEN", - "phpDocumentor", - "phpdoc", - "reflection", - "static analysis" - ], - "time": "2017-09-11T18:02:19+00:00" - }, - { - "name": "phpdocumentor/reflection-docblock", - "version": "4.3.0", - "source": { - "type": "git", - "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", - "reference": "94fd0001232e47129dd3504189fa1c7225010d08" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/94fd0001232e47129dd3504189fa1c7225010d08", - "reference": "94fd0001232e47129dd3504189fa1c7225010d08", - "shasum": "" - }, - "require": { - "php": "^7.0", - "phpdocumentor/reflection-common": "^1.0.0", - "phpdocumentor/type-resolver": "^0.4.0", - "webmozart/assert": "^1.0" - }, - "require-dev": { - "doctrine/instantiator": "~1.0.5", - "mockery/mockery": "^1.0", - "phpunit/phpunit": "^6.4" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "4.x-dev" - } - }, - "autoload": { - "psr-4": { - "phpDocumentor\\Reflection\\": [ - "src/" - ] - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Mike van Riel", - "email": "me@mikevanriel.com" - } - ], - "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", - "time": "2017-11-30T07:14:17+00:00" - }, - { - "name": "phpdocumentor/type-resolver", - "version": "0.4.0", - "source": { - "type": "git", - "url": "https://github.com/phpDocumentor/TypeResolver.git", - "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/9c977708995954784726e25d0cd1dddf4e65b0f7", - "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7", - "shasum": "" - }, - "require": { - "php": "^5.5 || ^7.0", - "phpdocumentor/reflection-common": "^1.0" - }, - "require-dev": { - "mockery/mockery": "^0.9.4", - "phpunit/phpunit": "^5.2||^4.8.24" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0.x-dev" - } - }, - "autoload": { - "psr-4": { - "phpDocumentor\\Reflection\\": [ - "src/" - ] - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Mike van Riel", - "email": "me@mikevanriel.com" - } - ], - "time": "2017-07-14T14:27:02+00:00" - }, - { - "name": "phpspec/prophecy", - "version": "1.8.0", - "source": { - "type": "git", - "url": "https://github.com/phpspec/prophecy.git", - "reference": "4ba436b55987b4bf311cb7c6ba82aa528aac0a06" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/phpspec/prophecy/zipball/4ba436b55987b4bf311cb7c6ba82aa528aac0a06", - "reference": "4ba436b55987b4bf311cb7c6ba82aa528aac0a06", - "shasum": "" - }, - "require": { - "doctrine/instantiator": "^1.0.2", - "php": "^5.3|^7.0", - "phpdocumentor/reflection-docblock": "^2.0|^3.0.2|^4.0", - "sebastian/comparator": "^1.1|^2.0|^3.0", - "sebastian/recursion-context": "^1.0|^2.0|^3.0" - }, - "require-dev": { - "phpspec/phpspec": "^2.5|^3.2", - "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5 || ^7.1" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.8.x-dev" - } - }, - "autoload": { - "psr-0": { - "Prophecy\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Konstantin Kudryashov", - "email": "ever.zet@gmail.com", - "homepage": "http://everzet.com" - }, - { - "name": "Marcello Duarte", - "email": "marcello.duarte@gmail.com" - } - ], - "description": "Highly opinionated mocking framework for PHP 5.3+", - "homepage": "https://github.com/phpspec/prophecy", - "keywords": [ - "Double", - "Dummy", - "fake", - "mock", - "spy", - "stub" - ], - "time": "2018-08-05T17:53:17+00:00" - }, - { - "name": "phpunit/php-code-coverage", - "version": "4.0.8", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/php-code-coverage.git", - "reference": "ef7b2f56815df854e66ceaee8ebe9393ae36a40d" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/ef7b2f56815df854e66ceaee8ebe9393ae36a40d", - "reference": "ef7b2f56815df854e66ceaee8ebe9393ae36a40d", - "shasum": "" - }, - "require": { - "ext-dom": "*", - "ext-xmlwriter": "*", - "php": "^5.6 || ^7.0", - "phpunit/php-file-iterator": "^1.3", - "phpunit/php-text-template": "^1.2", - "phpunit/php-token-stream": "^1.4.2 || ^2.0", - "sebastian/code-unit-reverse-lookup": "^1.0", - "sebastian/environment": "^1.3.2 || ^2.0", - "sebastian/version": "^1.0 || ^2.0" - }, - "require-dev": { - "ext-xdebug": "^2.1.4", - "phpunit/phpunit": "^5.7" - }, - "suggest": { - "ext-xdebug": "^2.5.1" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "4.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sb@sebastian-bergmann.de", - "role": "lead" - } - ], - "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", - "homepage": "https://github.com/sebastianbergmann/php-code-coverage", - "keywords": [ - "coverage", - "testing", - "xunit" - ], - "time": "2017-04-02T07:44:40+00:00" - }, - { - "name": "phpunit/php-file-iterator", - "version": "1.4.5", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/php-file-iterator.git", - "reference": "730b01bc3e867237eaac355e06a36b85dd93a8b4" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/730b01bc3e867237eaac355e06a36b85dd93a8b4", - "reference": "730b01bc3e867237eaac355e06a36b85dd93a8b4", - "shasum": "" - }, - "require": { - "php": ">=5.3.3" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.4.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sb@sebastian-bergmann.de", - "role": "lead" - } - ], - "description": "FilterIterator implementation that filters files based on a list of suffixes.", - "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", - "keywords": [ - "filesystem", - "iterator" - ], - "time": "2017-11-27T13:52:08+00:00" - }, - { - "name": "phpunit/php-text-template", - "version": "1.2.1", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/php-text-template.git", - "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/31f8b717e51d9a2afca6c9f046f5d69fc27c8686", - "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686", - "shasum": "" - }, - "require": { - "php": ">=5.3.3" - }, - "type": "library", - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de", - "role": "lead" - } - ], - "description": "Simple template engine.", - "homepage": "https://github.com/sebastianbergmann/php-text-template/", - "keywords": [ - "template" - ], - "time": "2015-06-21T13:50:34+00:00" - }, - { - "name": "phpunit/php-timer", - "version": "1.0.9", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/php-timer.git", - "reference": "3dcf38ca72b158baf0bc245e9184d3fdffa9c46f" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/3dcf38ca72b158baf0bc245e9184d3fdffa9c46f", - "reference": "3dcf38ca72b158baf0bc245e9184d3fdffa9c46f", - "shasum": "" - }, - "require": { - "php": "^5.3.3 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sb@sebastian-bergmann.de", - "role": "lead" - } - ], - "description": "Utility class for timing", - "homepage": "https://github.com/sebastianbergmann/php-timer/", - "keywords": [ - "timer" - ], - "time": "2017-02-26T11:10:40+00:00" - }, - { - "name": "phpunit/php-token-stream", - "version": "2.0.2", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/php-token-stream.git", - "reference": "791198a2c6254db10131eecfe8c06670700904db" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/791198a2c6254db10131eecfe8c06670700904db", - "reference": "791198a2c6254db10131eecfe8c06670700904db", - "shasum": "" - }, - "require": { - "ext-tokenizer": "*", - "php": "^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^6.2.4" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Wrapper around PHP's tokenizer extension.", - "homepage": "https://github.com/sebastianbergmann/php-token-stream/", - "keywords": [ - "tokenizer" - ], - "time": "2017-11-27T05:48:46+00:00" - }, - { - "name": "phpunit/phpunit", - "version": "5.7.27", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/phpunit.git", - "reference": "b7803aeca3ccb99ad0a506fa80b64cd6a56bbc0c" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/b7803aeca3ccb99ad0a506fa80b64cd6a56bbc0c", - "reference": "b7803aeca3ccb99ad0a506fa80b64cd6a56bbc0c", - "shasum": "" - }, - "require": { - "ext-dom": "*", - "ext-json": "*", - "ext-libxml": "*", - "ext-mbstring": "*", - "ext-xml": "*", - "myclabs/deep-copy": "~1.3", - "php": "^5.6 || ^7.0", - "phpspec/prophecy": "^1.6.2", - "phpunit/php-code-coverage": "^4.0.4", - "phpunit/php-file-iterator": "~1.4", - "phpunit/php-text-template": "~1.2", - "phpunit/php-timer": "^1.0.6", - "phpunit/phpunit-mock-objects": "^3.2", - "sebastian/comparator": "^1.2.4", - "sebastian/diff": "^1.4.3", - "sebastian/environment": "^1.3.4 || ^2.0", - "sebastian/exporter": "~2.0", - "sebastian/global-state": "^1.1", - "sebastian/object-enumerator": "~2.0", - "sebastian/resource-operations": "~1.0", - "sebastian/version": "^1.0.6|^2.0.1", - "symfony/yaml": "~2.1|~3.0|~4.0" - }, - "conflict": { - "phpdocumentor/reflection-docblock": "3.0.2" - }, - "require-dev": { - "ext-pdo": "*" - }, - "suggest": { - "ext-xdebug": "*", - "phpunit/php-invoker": "~1.1" - }, - "bin": [ - "phpunit" - ], - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "5.7.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de", - "role": "lead" - } - ], - "description": "The PHP Unit Testing framework.", - "homepage": "https://phpunit.de/", - "keywords": [ - "phpunit", - "testing", - "xunit" - ], - "time": "2018-02-01T05:50:59+00:00" - }, - { - "name": "phpunit/phpunit-mock-objects", - "version": "3.4.4", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git", - "reference": "a23b761686d50a560cc56233b9ecf49597cc9118" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/a23b761686d50a560cc56233b9ecf49597cc9118", - "reference": "a23b761686d50a560cc56233b9ecf49597cc9118", - "shasum": "" - }, - "require": { - "doctrine/instantiator": "^1.0.2", - "php": "^5.6 || ^7.0", - "phpunit/php-text-template": "^1.2", - "sebastian/exporter": "^1.2 || ^2.0" - }, - "conflict": { - "phpunit/phpunit": "<5.4.0" - }, - "require-dev": { - "phpunit/phpunit": "^5.4" - }, - "suggest": { - "ext-soap": "*" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "3.2.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sb@sebastian-bergmann.de", - "role": "lead" - } - ], - "description": "Mock Object library for PHPUnit", - "homepage": "https://github.com/sebastianbergmann/phpunit-mock-objects/", - "keywords": [ - "mock", - "xunit" - ], - "time": "2017-06-30T09:13:00+00:00" - }, - { - "name": "sebastian/code-unit-reverse-lookup", - "version": "1.0.1", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/code-unit-reverse-lookup.git", - "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/code-unit-reverse-lookup/zipball/4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", - "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", - "shasum": "" - }, - "require": { - "php": "^5.6 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^5.7 || ^6.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Looks up which function or method a line of code belongs to", - "homepage": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/", - "time": "2017-03-04T06:30:41+00:00" - }, - { - "name": "sebastian/comparator", - "version": "1.2.4", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/comparator.git", - "reference": "2b7424b55f5047b47ac6e5ccb20b2aea4011d9be" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/comparator/zipball/2b7424b55f5047b47ac6e5ccb20b2aea4011d9be", - "reference": "2b7424b55f5047b47ac6e5ccb20b2aea4011d9be", - "shasum": "" - }, - "require": { - "php": ">=5.3.3", - "sebastian/diff": "~1.2", - "sebastian/exporter": "~1.2 || ~2.0" - }, - "require-dev": { - "phpunit/phpunit": "~4.4" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.2.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Jeff Welch", - "email": "whatthejeff@gmail.com" - }, - { - "name": "Volker Dusch", - "email": "github@wallbash.com" - }, - { - "name": "Bernhard Schussek", - "email": "bschussek@2bepublished.at" - }, - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Provides the functionality to compare PHP values for equality", - "homepage": "http://www.github.com/sebastianbergmann/comparator", - "keywords": [ - "comparator", - "compare", - "equality" - ], - "time": "2017-01-29T09:50:25+00:00" - }, - { - "name": "sebastian/diff", - "version": "1.4.3", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/diff.git", - "reference": "7f066a26a962dbe58ddea9f72a4e82874a3975a4" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/7f066a26a962dbe58ddea9f72a4e82874a3975a4", - "reference": "7f066a26a962dbe58ddea9f72a4e82874a3975a4", - "shasum": "" - }, - "require": { - "php": "^5.3.3 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.4-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Kore Nordmann", - "email": "mail@kore-nordmann.de" - }, - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Diff implementation", - "homepage": "https://github.com/sebastianbergmann/diff", - "keywords": [ - "diff" - ], - "time": "2017-05-22T07:24:03+00:00" - }, - { - "name": "sebastian/environment", - "version": "2.0.0", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/environment.git", - "reference": "5795ffe5dc5b02460c3e34222fee8cbe245d8fac" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/5795ffe5dc5b02460c3e34222fee8cbe245d8fac", - "reference": "5795ffe5dc5b02460c3e34222fee8cbe245d8fac", - "shasum": "" - }, - "require": { - "php": "^5.6 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^5.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Provides functionality to handle HHVM/PHP environments", - "homepage": "http://www.github.com/sebastianbergmann/environment", - "keywords": [ - "Xdebug", - "environment", - "hhvm" - ], - "time": "2016-11-26T07:53:53+00:00" - }, - { - "name": "sebastian/exporter", - "version": "2.0.0", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/exporter.git", - "reference": "ce474bdd1a34744d7ac5d6aad3a46d48d9bac4c4" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/ce474bdd1a34744d7ac5d6aad3a46d48d9bac4c4", - "reference": "ce474bdd1a34744d7ac5d6aad3a46d48d9bac4c4", - "shasum": "" - }, - "require": { - "php": ">=5.3.3", - "sebastian/recursion-context": "~2.0" - }, - "require-dev": { - "ext-mbstring": "*", - "phpunit/phpunit": "~4.4" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Jeff Welch", - "email": "whatthejeff@gmail.com" - }, - { - "name": "Volker Dusch", - "email": "github@wallbash.com" - }, - { - "name": "Bernhard Schussek", - "email": "bschussek@2bepublished.at" - }, - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - }, - { - "name": "Adam Harvey", - "email": "aharvey@php.net" - } - ], - "description": "Provides the functionality to export PHP variables for visualization", - "homepage": "http://www.github.com/sebastianbergmann/exporter", - "keywords": [ - "export", - "exporter" - ], - "time": "2016-11-19T08:54:04+00:00" - }, - { - "name": "sebastian/global-state", - "version": "1.1.1", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/global-state.git", - "reference": "bc37d50fea7d017d3d340f230811c9f1d7280af4" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/global-state/zipball/bc37d50fea7d017d3d340f230811c9f1d7280af4", - "reference": "bc37d50fea7d017d3d340f230811c9f1d7280af4", - "shasum": "" - }, - "require": { - "php": ">=5.3.3" - }, - "require-dev": { - "phpunit/phpunit": "~4.2" - }, - "suggest": { - "ext-uopz": "*" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Snapshotting of global state", - "homepage": "http://www.github.com/sebastianbergmann/global-state", - "keywords": [ - "global state" - ], - "time": "2015-10-12T03:26:01+00:00" - }, - { - "name": "sebastian/object-enumerator", - "version": "2.0.1", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/object-enumerator.git", - "reference": "1311872ac850040a79c3c058bea3e22d0f09cbb7" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/object-enumerator/zipball/1311872ac850040a79c3c058bea3e22d0f09cbb7", - "reference": "1311872ac850040a79c3c058bea3e22d0f09cbb7", - "shasum": "" - }, - "require": { - "php": ">=5.6", - "sebastian/recursion-context": "~2.0" - }, - "require-dev": { - "phpunit/phpunit": "~5" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Traverses array structures and object graphs to enumerate all referenced objects", - "homepage": "https://github.com/sebastianbergmann/object-enumerator/", - "time": "2017-02-18T15:18:39+00:00" - }, - { - "name": "sebastian/recursion-context", - "version": "2.0.0", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/recursion-context.git", - "reference": "2c3ba150cbec723aa057506e73a8d33bdb286c9a" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/recursion-context/zipball/2c3ba150cbec723aa057506e73a8d33bdb286c9a", - "reference": "2c3ba150cbec723aa057506e73a8d33bdb286c9a", - "shasum": "" - }, - "require": { - "php": ">=5.3.3" - }, - "require-dev": { - "phpunit/phpunit": "~4.4" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Jeff Welch", - "email": "whatthejeff@gmail.com" - }, - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - }, - { - "name": "Adam Harvey", - "email": "aharvey@php.net" - } - ], - "description": "Provides functionality to recursively process PHP variables", - "homepage": "http://www.github.com/sebastianbergmann/recursion-context", - "time": "2016-11-19T07:33:16+00:00" - }, - { - "name": "sebastian/resource-operations", - "version": "1.0.0", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/resource-operations.git", - "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/resource-operations/zipball/ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", - "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", - "shasum": "" - }, - "require": { - "php": ">=5.6.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de" - } - ], - "description": "Provides a list of PHP built-in functions that operate on resources", - "homepage": "https://www.github.com/sebastianbergmann/resource-operations", - "time": "2015-07-28T20:34:47+00:00" - }, - { - "name": "sebastian/version", - "version": "2.0.1", - "source": { - "type": "git", - "url": "https://github.com/sebastianbergmann/version.git", - "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/version/zipball/99732be0ddb3361e16ad77b68ba41efc8e979019", - "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019", - "shasum": "" - }, - "require": { - "php": ">=5.6" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.0.x-dev" - } - }, - "autoload": { - "classmap": [ - "src/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "authors": [ - { - "name": "Sebastian Bergmann", - "email": "sebastian@phpunit.de", - "role": "lead" - } - ], - "description": "Library that helps with managing the version number of Git-hosted PHP projects", - "homepage": "https://github.com/sebastianbergmann/version", - "time": "2016-10-03T07:35:21+00:00" - }, - { - "name": "symfony/polyfill-ctype", - "version": "v1.9.0", - "source": { - "type": "git", - "url": "https://github.com/symfony/polyfill-ctype.git", - "reference": "e3d826245268269cd66f8326bd8bc066687b4a19" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/e3d826245268269cd66f8326bd8bc066687b4a19", - "reference": "e3d826245268269cd66f8326bd8bc066687b4a19", - "shasum": "" - }, - "require": { - "php": ">=5.3.3" - }, - "suggest": { - "ext-ctype": "For best performance" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.9-dev" - } - }, - "autoload": { - "psr-4": { - "Symfony\\Polyfill\\Ctype\\": "" - }, - "files": [ - "bootstrap.php" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Symfony Community", - "homepage": "https://symfony.com/contributors" - }, - { - "name": "Gert de Pagter", - "email": "BackEndTea@gmail.com" - } - ], - "description": "Symfony polyfill for ctype functions", - "homepage": "https://symfony.com", - "keywords": [ - "compatibility", - "ctype", - "polyfill", - "portable" - ], - "time": "2018-08-06T14:22:27+00:00" - }, - { - "name": "symfony/yaml", - "version": "v4.1.3", - "source": { - "type": "git", - "url": "https://github.com/symfony/yaml.git", - "reference": "46bc69aa91fc4ab78a96ce67873a6b0c148fd48c" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/symfony/yaml/zipball/46bc69aa91fc4ab78a96ce67873a6b0c148fd48c", - "reference": "46bc69aa91fc4ab78a96ce67873a6b0c148fd48c", - "shasum": "" - }, - "require": { - "php": "^7.1.3", - "symfony/polyfill-ctype": "~1.8" - }, - "conflict": { - "symfony/console": "<3.4" - }, - "require-dev": { - "symfony/console": "~3.4|~4.0" - }, - "suggest": { - "symfony/console": "For validating YAML files using the lint command" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "4.1-dev" - } - }, - "autoload": { - "psr-4": { - "Symfony\\Component\\Yaml\\": "" - }, - "exclude-from-classmap": [ - "/Tests/" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Fabien Potencier", - "email": "fabien@symfony.com" - }, - { - "name": "Symfony Community", - "homepage": "https://symfony.com/contributors" - } - ], - "description": "Symfony Yaml Component", - "homepage": "https://symfony.com", - "time": "2018-07-26T11:24:31+00:00" - }, - { - "name": "webmozart/assert", - "version": "1.3.0", - "source": { - "type": "git", - "url": "https://github.com/webmozart/assert.git", - "reference": "0df1908962e7a3071564e857d86874dad1ef204a" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/webmozart/assert/zipball/0df1908962e7a3071564e857d86874dad1ef204a", - "reference": "0df1908962e7a3071564e857d86874dad1ef204a", - "shasum": "" - }, - "require": { - "php": "^5.3.3 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^4.6", - "sebastian/version": "^1.0.1" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "1.3-dev" - } - }, - "autoload": { - "psr-4": { - "Webmozart\\Assert\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Bernhard Schussek", - "email": "bschussek@gmail.com" - } - ], - "description": "Assertions to validate method input/output with nice error messages.", - "keywords": [ - "assert", - "check", - "validate" - ], - "time": "2018-01-29T19:49:41+00:00" - } - ], - "aliases": [], - "minimum-stability": "stable", - "stability-flags": [], - "prefer-stable": false, - "prefer-lowest": false, - "platform": { - "php": "7.*", - "ext-ctype": "*", - "ext-mbstring": "*" - }, - "platform-dev": [] -} diff --git a/phpunit.xml b/phpunit.xml deleted file mode 100644 index 8c26e40..0000000 --- a/phpunit.xml +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - - ./tests/ - - - - - - src - - - \ No newline at end of file diff --git a/readme.md b/readme.md index fa8fa15..2206659 100644 --- a/readme.md +++ b/readme.md @@ -1,73 +1,32 @@ -# PHP.Science - TextRank - -[![Build Status](https://travis-ci.org/DavidBelicza/PHP-Science-TextRank.svg?branch=master)](https://travis-ci.org/DavidBelicza/PHP-Science-TextRank) -[![Latest Stable Version](https://poser.pugx.org/php-science/textrank/v/stable.svg)](https://packagist.org/packages/php-science/textrank) -[![License](https://img.shields.io/badge/license-MIT-33CCFF.svg)](https://opensource.org/licenses/MIT) -[![composer.lock](https://poser.pugx.org/php-science/textrank/composerlock)](https://packagist.org/packages/php-science/textrank) - -This source code is an implementation of the TextRank algorithm (Automatic summarization) on PHP7 strict mode. It can summarize a text, article for example to a short paragraph. Before it would start the summarizing it removes the junk words what are defined in the Stopwords namespace. It is possible to extend it with another languages. - - - -## Authors, Contributors - -Name | GitHub user ---- | --- -David Belicza | @DavidBelicza -Riccardo Marton | @riccardomarton -Syndesi | @Syndesi -vincentsch | @vincentsch -Andrew Welch | @khalwat -Andrey Astashov | @mvcaaa -Leo Toneff | @bragle - -## TextRank or Automatic summarization -> Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax. Automatic data summarization is part of machine learning and data mining. The main idea of summarization is to find a representative subset of the data, which contains the information of the entire set. Summarization technologies are used in a large number of sectors in industry today. - Wikipedia - -The algorithm of this implementation is: -* Find sentences, -* Remove stopwords, -* Create integer values by find and count the matching words, -* Change the integer values by the related words' integer values, -* Normalize values to create scores, -* Order by scores - -## Install -``` -composer require php-science/textrank -``` - -## Test -``` -cd project-folder -composer test -``` -or -``` -cd project-folder -phpunit --colors='always' $(pwd)/tests -``` - -## Examples -```php - -use PhpScience\TextRank\Tool\StopWords\English; - -// String contains a long text, see the /res/sample1.txt file. -$text = "Lorem ipsum..."; - -$api = new TextRankFacade(); -// English implementation for stopwords/junk words: -$stopWords = new English(); -$api->setStopWords($stopWords); - -// Array of the most important keywords: -$result = $api->getOnlyKeyWords($text); - -// Array of the sentences from the most important part of the text: -$result = $api->getHighlights($text); - -// Array of the most important sentences from the text: -$result = $api->summarizeTextBasic($text); -``` -More examples: https://github.com/DoveID/PHP-Science-TextRank/blob/master/tests/TextRankFacadeTest.php +

+PHP.Science TextRank +

+ +

+ + + + + + + + + + + + + + + +

+ +

+This source code is an OOP implementation of the TextRank algorithm. +
The minimum required PHP version is 7.4. +
+
+

+ +## About + +v 2.0.0 WIP diff --git a/src/TextRankFacade.php b/src/TextRankFacade.php index 46758d9..008d22a 100644 --- a/src/TextRankFacade.php +++ b/src/TextRankFacade.php @@ -1,11 +1,4 @@ - */ declare(strict_types=1); diff --git a/src/Tool/Graph.php b/src/Tool/Graph.php index 738cde0..06cdbe9 100644 --- a/src/Tool/Graph.php +++ b/src/Tool/Graph.php @@ -1,24 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool; -/** - * Class Graph - * - * This graph store the sentences and their words with the indexes. This graph - * is the full map of the whole text. - * - * @package PhpScience\TextRank\Tool - */ class Graph { /** diff --git a/src/Tool/Parser.php b/src/Tool/Parser.php index fdd1112..76829be 100644 --- a/src/Tool/Parser.php +++ b/src/Tool/Parser.php @@ -1,11 +1,4 @@ - */ declare(strict_types=1); @@ -13,13 +6,6 @@ use PhpScience\TextRank\Tool\StopWords\StopWordsAbstract; -/** - * Class Parser - * - * This class purpose to parse a real text to sentences and array. - * - * @package PhpScience\TextRank\Tool - */ class Parser { /** diff --git a/src/Tool/Score.php b/src/Tool/Score.php index 26c790a..f28b2c3 100644 --- a/src/Tool/Score.php +++ b/src/Tool/Score.php @@ -1,23 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool; -/** - * Class Score - * - * It handles words and assigns weighted numbers to them. - * - * @package PhpScience\TextRank\Tool - */ class Score { /** diff --git a/src/Tool/StopWords/English.php b/src/Tool/StopWords/English.php index 09a0828..4b08b8b 100644 --- a/src/Tool/StopWords/English.php +++ b/src/Tool/StopWords/English.php @@ -1,21 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class English - * - * @package PhpScience\TextRank\Tool\StopWords - */ class English extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/French.php b/src/Tool/StopWords/French.php index 0e67ad7..9941bdd 100644 --- a/src/Tool/StopWords/French.php +++ b/src/Tool/StopWords/French.php @@ -1,21 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class French - * - * @package PhpScience\TextRank\Tool\StopWords - */ class French extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/German.php b/src/Tool/StopWords/German.php index 6faf7b0..24130ea 100644 --- a/src/Tool/StopWords/German.php +++ b/src/Tool/StopWords/German.php @@ -1,18 +1,9 @@ - */ + declare(strict_types=1); + namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class German - * - * @package PhpScience\TextRank\Tool\StopWords - */ + class German extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/Italian.php b/src/Tool/StopWords/Italian.php index 6aa3093..67ca0e9 100644 --- a/src/Tool/StopWords/Italian.php +++ b/src/Tool/StopWords/Italian.php @@ -4,11 +4,6 @@ namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class Italian - * - * @package PhpScience\TextRank\Tool\StopWords - */ class Italian extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/Norwegian.php b/src/Tool/StopWords/Norwegian.php index 6d1af40..b84ac91 100644 --- a/src/Tool/StopWords/Norwegian.php +++ b/src/Tool/StopWords/Norwegian.php @@ -1,21 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class Norwegian - * - * @package PhpScience\TextRank\Tool\StopWords - */ class Norwegian extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/Russian.php b/src/Tool/StopWords/Russian.php index 82edf0f..a9ad80f 100644 --- a/src/Tool/StopWords/Russian.php +++ b/src/Tool/StopWords/Russian.php @@ -1,22 +1,9 @@ - * @author Andrey Astashov (Russian StopWords) - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class Russian - * - * @package PhpScience\TextRank\Tool\StopWords - */ class Russian extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/Spanish.php b/src/Tool/StopWords/Spanish.php index f7cdffd..325a2c7 100644 --- a/src/Tool/StopWords/Spanish.php +++ b/src/Tool/StopWords/Spanish.php @@ -1,21 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class Spanish - * - * @package PhpScience\TextRank\Tool\StopWords - */ class Spanish extends StopWordsAbstract { /** diff --git a/src/Tool/StopWords/StopWordsAbstract.php b/src/Tool/StopWords/StopWordsAbstract.php index 33b2128..87d46a5 100644 --- a/src/Tool/StopWords/StopWordsAbstract.php +++ b/src/Tool/StopWords/StopWordsAbstract.php @@ -1,21 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool\StopWords; -/** - * Class StopWordsAbstract - * - * @package PhpScience\TextRank\Tool\StopWords - */ abstract class StopWordsAbstract { /** diff --git a/src/Tool/Summarize.php b/src/Tool/Summarize.php index f63c0d0..fd4fed4 100644 --- a/src/Tool/Summarize.php +++ b/src/Tool/Summarize.php @@ -1,23 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool; -/** - * Class Summarize - * - * This is for summarize the text from parsed data. - * - * @package PhpScience\TextRank\Tool - */ class Summarize { /** diff --git a/src/Tool/Text.php b/src/Tool/Text.php index 037cacf..458fc04 100644 --- a/src/Tool/Text.php +++ b/src/Tool/Text.php @@ -1,23 +1,9 @@ - */ declare(strict_types=1); namespace PhpScience\TextRank\Tool; -/** - * Class Text - * - * This class is for store the parsed texts. - * - * @package PhpScience\TextRank\Tool - */ class Text { /** diff --git a/tests/TextRankFacadeTest.php b/tests/functional/TextRankFacadeTest.php similarity index 93% rename from tests/TextRankFacadeTest.php rename to tests/functional/TextRankFacadeTest.php index d1ef4e5..41151ac 100644 --- a/tests/TextRankFacadeTest.php +++ b/tests/functional/TextRankFacadeTest.php @@ -1,11 +1,4 @@ - */ declare(strict_types=1); @@ -19,11 +12,11 @@ class TextRankFacadeTest extends \PHPUnit\Framework\TestCase { protected $sampleText1; - public function setUp() + public function setUp(): void { parent::setUp(); - $path = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'res' + $path = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'resource' . DIRECTORY_SEPARATOR . 'sample1.txt'; $file = fopen($path, 'r'); diff --git a/tests/phpunit.xml b/tests/phpunit.xml new file mode 100644 index 0000000..dca7fc5 --- /dev/null +++ b/tests/phpunit.xml @@ -0,0 +1,34 @@ + + + + + + ../tests/unit + + + ../tests/functional + + + + + + + ../src + + + + + + + + + \ No newline at end of file diff --git a/res/sample1.txt b/tests/resource/sample1.txt similarity index 100% rename from res/sample1.txt rename to tests/resource/sample1.txt From 40e40d97a3817a612a0fc17e5ace82c2264d390f Mon Sep 17 00:00:00 2001 From: David Belicza <87.bdavid@gmail.com> Date: Tue, 22 Sep 2020 19:19:12 +0200 Subject: [PATCH 2/5] Reimplementing the parsing/ranking logic --- .gitignore | 1 + src/Builder/PageRankDataSourceBuilder.php | 47 +++++++++++ src/Builder/TextBuilder.php | 47 +++++++++++ src/Builder/TextBuilderInterface.php | 12 +++ src/Builder/TextRankOutputBuilder.php | 10 +++ .../TextRankOutputBuilderInterface.php | 10 +++ src/Data/Text.php | 31 +++++++ src/Data/Text/Sentence.php | 30 +++++++ src/Data/Text/SentenceInterface.php | 22 +++++ src/Data/Text/TokenMap.php | 25 ++++++ src/Data/Text/TokenMapInterface.php | 17 ++++ src/Data/TextInterface.php | 18 +++++ src/Data/TextRankOutput.php | 10 +++ src/Data/TextRankOutputInterface.php | 10 +++ src/Facade/TextRank.php | 50 ++++++++++++ src/Service/Parser.php | 53 ++++++++++++ src/Strategy/PageRankStrategy.php | 81 +++++++++++++++++++ .../RankingAlgorithmStrategyInterface.php | 13 +++ 18 files changed, 487 insertions(+) create mode 100644 src/Builder/PageRankDataSourceBuilder.php create mode 100644 src/Builder/TextBuilder.php create mode 100644 src/Builder/TextBuilderInterface.php create mode 100644 src/Builder/TextRankOutputBuilder.php create mode 100644 src/Builder/TextRankOutputBuilderInterface.php create mode 100644 src/Data/Text.php create mode 100644 src/Data/Text/Sentence.php create mode 100644 src/Data/Text/SentenceInterface.php create mode 100644 src/Data/Text/TokenMap.php create mode 100644 src/Data/Text/TokenMapInterface.php create mode 100644 src/Data/TextInterface.php create mode 100644 src/Data/TextRankOutput.php create mode 100644 src/Data/TextRankOutputInterface.php create mode 100644 src/Facade/TextRank.php create mode 100644 src/Service/Parser.php create mode 100644 src/Strategy/PageRankStrategy.php create mode 100644 src/Strategy/RankingAlgorithmStrategyInterface.php diff --git a/.gitignore b/.gitignore index fa374aa..af0e4c2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /.idea +/.tmp /composer.lock /vendor /var diff --git a/src/Builder/PageRankDataSourceBuilder.php b/src/Builder/PageRankDataSourceBuilder.php new file mode 100644 index 0000000..fd08c28 --- /dev/null +++ b/src/Builder/PageRankDataSourceBuilder.php @@ -0,0 +1,47 @@ +getSentences() as $sentence) { + foreach ($sentence->getVector() as $index => $tokenId) { + if (!isset($dataSource[$tokenId])) { + $dataSource[$tokenId] = [ + self::ID => $tokenId, + self::LEFT => [], + self::RIGHT => [] + ]; + } + + if ($sentence->isIndexExists($index - 1)) { + $previousTokenId = $sentence->getTokenId($index - 1); + if ($text->getTokenMap()->isExists($previousTokenId)) { + $dataSource[$tokenId][self::LEFT][] = $previousTokenId; + } + } + + if ($sentence->isIndexExists($index + 1)) { + $nextTokenId = $sentence->getTokenId($index + 1); + if ($text->getTokenMap()->isExists($nextTokenId)) { + $dataSource[$tokenId][self::RIGHT][] = $nextTokenId; + } + } + } + } + + return $dataSource; + } +} diff --git a/src/Builder/TextBuilder.php b/src/Builder/TextBuilder.php new file mode 100644 index 0000000..78dae18 --- /dev/null +++ b/src/Builder/TextBuilder.php @@ -0,0 +1,47 @@ + $sentenceTokenList) { + $sentenceVector = []; + foreach ($sentenceTokenList as $token) { + if (!isset($tokens[$token])) { + $tokens[$token] = $i; + $tokenId = $i; + $i++; + } else { + $tokenId = $tokens[$token]; + } + + $sentenceVector[] = $tokenId; + } + + $sentence = new Sentence(); + $sentence->setVector($sentenceVector); + $sentences[] = $sentence; + } + + $tokenMap = new TokenMap(); + $tokenMap->setTokenMap(array_flip($tokens)); + + return new Text( + $tokenMap, + $sentences + ); + } +} diff --git a/src/Builder/TextBuilderInterface.php b/src/Builder/TextBuilderInterface.php new file mode 100644 index 0000000..1f7d47f --- /dev/null +++ b/src/Builder/TextBuilderInterface.php @@ -0,0 +1,12 @@ +tokenMap = $tokenMap; + $this->sentences = $sentences; + } + + public function getTokenMap(): TokenMapInterface + { + return $this->tokenMap; + } + + public function getSentences(): array + { + return $this->sentences; + } +} diff --git a/src/Data/Text/Sentence.php b/src/Data/Text/Sentence.php new file mode 100644 index 0000000..6acd5e8 --- /dev/null +++ b/src/Data/Text/Sentence.php @@ -0,0 +1,30 @@ +vector = $vector; + } + + public function getVector(): array + { + return $this->vector; + } + + public function isIndexExists(int $index): bool + { + return isset($this->vector[$index]); + } + + public function getTokenId(int $index): int + { + return $this->vector[$index]; + } +} diff --git a/src/Data/Text/SentenceInterface.php b/src/Data/Text/SentenceInterface.php new file mode 100644 index 0000000..59c3d48 --- /dev/null +++ b/src/Data/Text/SentenceInterface.php @@ -0,0 +1,22 @@ +tokenMap = $tokenMap; + } + + public function isExists(int $tokenId): bool + { + return isset($this->tokenMap[$tokenId]); + } + + public function getToken(int $tokenId): string + { + return $this->tokenMap[$tokenId]; + } +} diff --git a/src/Data/Text/TokenMapInterface.php b/src/Data/Text/TokenMapInterface.php new file mode 100644 index 0000000..bf54db0 --- /dev/null +++ b/src/Data/Text/TokenMapInterface.php @@ -0,0 +1,17 @@ +parse($rawText); + + $pageRankStrategy = new PageRankStrategy( + new PageRankDataSourceBuilder() + ); + + $nodeCollection = $pageRankStrategy->rank($text); + + echo PHP_EOL; + + $i = 0; + $nodes = []; + + foreach ($nodeCollection->getNodes() as $node) { + $nodes[] = $node; + $i++; + + if ($i === $maxKeywords) { + break; + } + + /*echo $text->getTokenMap()->getToken($node->getId()); + echo ' - '; + echo $node->getRank(); + echo PHP_EOL;*/ + } + + return $nodes; + } +} diff --git a/src/Service/Parser.php b/src/Service/Parser.php new file mode 100644 index 0000000..35e0913 --- /dev/null +++ b/src/Service/Parser.php @@ -0,0 +1,53 @@ +textBuilder = $textBuilder; + } + + public function parse(string $rawText): TextInterface + { + $sentences = preg_split( + '/(\n+)|(\.\s|\?\s|\!\s)(?![^\(]*\))/', + $rawText, + -1, + PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE + ); + + $textMap = []; + + foreach ($sentences as $sentenceIndex => $sentence) { + $tokens = preg_split( + '/(?:(^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/', + $sentence, + -1, + PREG_SPLIT_NO_EMPTY + ); + + foreach ($tokens as $tokenIndex => $token) { + $tokens[$tokenIndex] = mb_strtolower(trim($token)); + } + + //@todo stopwords + + $textMap[$sentenceIndex] = $tokens; + } + + return $this->textBuilder->build( + $sentences, + $textMap + ); + } +} diff --git a/src/Strategy/PageRankStrategy.php b/src/Strategy/PageRankStrategy.php new file mode 100644 index 0000000..a831142 --- /dev/null +++ b/src/Strategy/PageRankStrategy.php @@ -0,0 +1,81 @@ +pageRankDataSourceBuilder = $pageRankDataSourceBuilder; + } + + public function rank(TextInterface $text): NodeCollectionInterface + { + $dataSource = $this->pageRankDataSourceBuilder->build($text); + $strategy = $this->createPageRankStrategy($dataSource); + $ranking = $this->createRanking($strategy); + $pageRankAlgorithm = $this->createPageRankAlgorithm($ranking, $strategy); + $maxIteration = 100; + + return $pageRankAlgorithm->run($maxIteration); + } + + private function createPageRankAlgorithm( + RankingInterface $ranking, + NodeDataSourceStrategyInterface $strategy + ): PageRankAlgorithmInterface { + + $normalizer = new Normalizer(); + + return new PageRankAlgorithm( + $ranking, + $strategy, + $normalizer + ); + } + + private function createPageRankStrategy( + array $dataSource + ): NodeDataSourceStrategyInterface { + + $nodeBuilder = new NodeBuilder(); + $nodeCollectionBuilder = new NodeCollectionBuilder(); + + return new MemorySourceStrategy( + $nodeBuilder, + $nodeCollectionBuilder, + $dataSource + ); + } + + private function createRanking( + NodeDataSourceStrategyInterface $strategy + ): RankingInterface { + + $rankComparator = new RankComparator(); + + return new Ranking( + $rankComparator, + $strategy + ); + } +} diff --git a/src/Strategy/RankingAlgorithmStrategyInterface.php b/src/Strategy/RankingAlgorithmStrategyInterface.php new file mode 100644 index 0000000..3387864 --- /dev/null +++ b/src/Strategy/RankingAlgorithmStrategyInterface.php @@ -0,0 +1,13 @@ + Date: Wed, 23 Sep 2020 21:53:37 +0200 Subject: [PATCH 3/5] Refactoring facade class --- src/Builder/TextBuilder.php | 10 +- src/Builder/TextRankOutputBuilder.php | 52 ++- .../TextRankOutputBuilderInterface.php | 10 +- src/Data/StopWordCollection.php | 329 ++++++++++++++++++ src/Data/Text/Sentence.php | 24 +- src/Data/Text/SentenceInterface.php | 8 + src/Data/TextRankOutput.php | 23 +- src/Data/TextRankOutput/OutputValue.php | 42 +++ .../TextRankOutput/OutputValueInterface.php | 20 ++ src/Data/TextRankOutputInterface.php | 21 ++ src/Facade/TextRank.php | 62 ++-- src/Service/GetTopNodes.php | 30 ++ src/Service/GetTopNodesInterface.php | 22 ++ src/Service/Parser.php | 15 +- src/Service/SentenceWeighting.php | 45 +++ src/Service/SentenceWeightingInterface.php | 19 + src/Service/StopWordFilter.php | 23 ++ src/Strategy/PageRankStrategy.php | 2 +- tests/resource/sample1.txt | 1 + 19 files changed, 713 insertions(+), 45 deletions(-) create mode 100644 src/Data/StopWordCollection.php create mode 100644 src/Data/TextRankOutput/OutputValue.php create mode 100644 src/Data/TextRankOutput/OutputValueInterface.php create mode 100644 src/Service/GetTopNodes.php create mode 100644 src/Service/GetTopNodesInterface.php create mode 100644 src/Service/SentenceWeighting.php create mode 100644 src/Service/SentenceWeightingInterface.php create mode 100644 src/Service/StopWordFilter.php diff --git a/src/Builder/TextBuilder.php b/src/Builder/TextBuilder.php index 78dae18..fdccf6e 100644 --- a/src/Builder/TextBuilder.php +++ b/src/Builder/TextBuilder.php @@ -20,6 +20,7 @@ public function build(array $originalSentences, array $textMap): TextInterface foreach ($textMap as $sentenceIndex => $sentenceTokenList) { $sentenceVector = []; foreach ($sentenceTokenList as $token) { + $token = (string)$token; if (!isset($tokens[$token])) { $tokens[$token] = $i; $tokenId = $i; @@ -32,12 +33,19 @@ public function build(array $originalSentences, array $textMap): TextInterface } $sentence = new Sentence(); + $sentence->setId($sentenceIndex); $sentence->setVector($sentenceVector); + $sentence->setOriginalValue($originalSentences[$sentenceIndex]); $sentences[] = $sentence; } $tokenMap = new TokenMap(); - $tokenMap->setTokenMap(array_flip($tokens)); + $tokenMap->setTokenMap( + array_map( + 'strval', + array_flip($tokens) + ) + ); return new Text( $tokenMap, diff --git a/src/Builder/TextRankOutputBuilder.php b/src/Builder/TextRankOutputBuilder.php index b4b8590..2155724 100644 --- a/src/Builder/TextRankOutputBuilder.php +++ b/src/Builder/TextRankOutputBuilder.php @@ -4,7 +4,57 @@ namespace PhpScience\TextRank\Builder; -class TextRankOutputBuilder +use PhpScience\PageRank\Data\NodeCollectionInterface; +use PhpScience\TextRank\Data\TextInterface; +use PhpScience\TextRank\Data\TextRankOutput; +use PhpScience\TextRank\Data\TextRankOutput\OutputValue; +use PhpScience\TextRank\Data\TextRankOutputInterface; +use PhpScience\TextRank\Service\GetTopNodesInterface; +use PhpScience\TextRank\Service\SentenceWeightingInterface; + +class TextRankOutputBuilder implements TextRankOutputBuilderInterface { + private GetTopNodesInterface $getTopNodes; + private SentenceWeightingInterface $sentenceWeighting; + + public function __construct( + GetTopNodesInterface $getTopNodes, + SentenceWeightingInterface $sentenceWeighting + ) { + $this->getTopNodes = $getTopNodes; + $this->sentenceWeighting = $sentenceWeighting; + } + + public function build( + TextInterface $text, + NodeCollectionInterface $nodeCollection, + int $maxKeywords + ): TextRankOutputInterface { + $nodes = $this->getTopNodes->execute($nodeCollection, $maxKeywords); + $words = []; + + foreach ($nodes as $node) { + $token = $text + ->getTokenMap() + ->getToken($node->getId()); + $word = new OutputValue(); + $word->setId($node->getId()); + $word->setValue($token); + $word->setRank($node->getRank()); + + $words[] = $word; + } + + $textRankOutput = new TextRankOutput(); + $textRankOutput->setKeyWords(array_slice($words, 0, $maxKeywords)); + + $sentences = $this->sentenceWeighting->weight($text, $words); + + + + $textRankOutput->setSentences($sentences); + + return $textRankOutput; + } } diff --git a/src/Builder/TextRankOutputBuilderInterface.php b/src/Builder/TextRankOutputBuilderInterface.php index ad5b33f..4c42fc6 100644 --- a/src/Builder/TextRankOutputBuilderInterface.php +++ b/src/Builder/TextRankOutputBuilderInterface.php @@ -4,7 +4,15 @@ namespace PhpScience\TextRank\Builder; +use PhpScience\PageRank\Data\NodeCollectionInterface; +use PhpScience\TextRank\Data\TextInterface; +use PhpScience\TextRank\Data\TextRankOutputInterface; + interface TextRankOutputBuilderInterface { - public function build(); + public function build( + TextInterface $text, + NodeCollectionInterface $nodeCollection, + int $maxKeywords + ): TextRankOutputInterface; } diff --git a/src/Data/StopWordCollection.php b/src/Data/StopWordCollection.php new file mode 100644 index 0000000..106bb24 --- /dev/null +++ b/src/Data/StopWordCollection.php @@ -0,0 +1,329 @@ +id = $id; + } + + public function getId(): int + { + return $this->id; + } + + public function setOriginalValue(string $originalValue): void + { + $this->originalValue = $originalValue; + } + + public function getOriginalValue(): string + { + return $this->originalValue; + } public function setVector(array $vector): void { diff --git a/src/Data/Text/SentenceInterface.php b/src/Data/Text/SentenceInterface.php index 59c3d48..d8a5b17 100644 --- a/src/Data/Text/SentenceInterface.php +++ b/src/Data/Text/SentenceInterface.php @@ -6,6 +6,14 @@ interface SentenceInterface { + public function setId(int $id): void; + + public function getId(): int; + + public function setOriginalValue(string $originalValue): void; + + public function getOriginalValue(): string; + /** * @param int[] $vector */ diff --git a/src/Data/TextRankOutput.php b/src/Data/TextRankOutput.php index b4dcec2..ce47e08 100644 --- a/src/Data/TextRankOutput.php +++ b/src/Data/TextRankOutput.php @@ -4,7 +4,28 @@ namespace PhpScience\TextRank\Data; -class TextRankOutput +class TextRankOutput implements TextRankOutputInterface { + private ?array $keyWords; + private ?array $sentences; + public function setKeyWords(array $keywords): void + { + $this->keyWords = $keywords; + } + + public function getKeyWords(): ?array + { + return $this->keyWords; + } + + public function setSentences(array $sentences): void + { + $this->sentences = $sentences; + } + + public function getSentences(): ?array + { + return $this->sentences; + } } diff --git a/src/Data/TextRankOutput/OutputValue.php b/src/Data/TextRankOutput/OutputValue.php new file mode 100644 index 0000000..bd12f81 --- /dev/null +++ b/src/Data/TextRankOutput/OutputValue.php @@ -0,0 +1,42 @@ +id = $id; + } + + public function getId(): int + { + return $this->id; + } + + public function setValue(string $value): void + { + $this->value = $value; + } + + public function getValue(): string + { + return $this->value; + } + + public function setRank(float $rank): void + { + $this->rank = $rank; + } + + public function getRank(): float + { + return $this->rank; + } +} diff --git a/src/Data/TextRankOutput/OutputValueInterface.php b/src/Data/TextRankOutput/OutputValueInterface.php new file mode 100644 index 0000000..e3458d3 --- /dev/null +++ b/src/Data/TextRankOutput/OutputValueInterface.php @@ -0,0 +1,20 @@ +parse($rawText); - - $pageRankStrategy = new PageRankStrategy( - new PageRankDataSourceBuilder() - ); - - $nodeCollection = $pageRankStrategy->rank($text); - - echo PHP_EOL; - - $i = 0; - $nodes = []; - - foreach ($nodeCollection->getNodes() as $node) { - $nodes[] = $node; - $i++; + private Parser $parser; + private RankingAlgorithmStrategyInterface $rankingAlgorithmStrategy; + private TextRankOutputBuilderInterface $textRankOutputBuilder; + + public function __construct( + Parser $parser, + RankingAlgorithmStrategyInterface $rankingAlgorithmStrategy, + TextRankOutputBuilderInterface $textRankOutputBuilder + ) { + $this->parser = $parser; + $this->rankingAlgorithmStrategy = $rankingAlgorithmStrategy; + $this->textRankOutputBuilder = $textRankOutputBuilder; + } - if ($i === $maxKeywords) { - break; - } + public function getKeywords( + string $rawText, + int $maxKeywords + ): TextRankOutputInterface { - /*echo $text->getTokenMap()->getToken($node->getId()); - echo ' - '; - echo $node->getRank(); - echo PHP_EOL;*/ - } + $text = $this->parser->parse($rawText); + $nodeCollection = $this->rankingAlgorithmStrategy->rank($text); - return $nodes; + return $this->textRankOutputBuilder->build( + $text, + $nodeCollection, + $maxKeywords + ); } } diff --git a/src/Service/GetTopNodes.php b/src/Service/GetTopNodes.php new file mode 100644 index 0000000..1b201d2 --- /dev/null +++ b/src/Service/GetTopNodes.php @@ -0,0 +1,30 @@ +getNodes()); + $size = count($nodes); + + for ($i = 0; $i < $size; $i++) { + for ($j = 0; $j < $size; $j++) { + if ($nodes[$i]->getRank() > $nodes[$j]->getRank()) { + $tmp = $nodes[$i]; + $nodes[$i] = $nodes[$j]; + $nodes[$j] = $tmp; + } + } + } + + return $nodes; + } +} diff --git a/src/Service/GetTopNodesInterface.php b/src/Service/GetTopNodesInterface.php new file mode 100644 index 0000000..6e257bc --- /dev/null +++ b/src/Service/GetTopNodesInterface.php @@ -0,0 +1,22 @@ +textBuilder = $textBuilder; + $this->stopWordFilter = $stopWordFilter; } public function parse(string $rawText): TextInterface @@ -37,10 +40,14 @@ public function parse(string $rawText): TextInterface ); foreach ($tokens as $tokenIndex => $token) { - $tokens[$tokenIndex] = mb_strtolower(trim($token)); - } + $token = mb_strtolower(trim($token)); - //@todo stopwords + if ($this->stopWordFilter->isStopWord($token)) { + unset($tokens[$tokenIndex]); + } else { + $tokens[$tokenIndex] = mb_strtolower(trim($token)); + } + } $textMap[$sentenceIndex] = $tokens; } diff --git a/src/Service/SentenceWeighting.php b/src/Service/SentenceWeighting.php new file mode 100644 index 0000000..69b4056 --- /dev/null +++ b/src/Service/SentenceWeighting.php @@ -0,0 +1,45 @@ +getId()] = $keyword->getRank(); + } + + $sentenceOutputList = []; + + foreach ($text->getSentences() as $sentence) { + $vector = $sentence->getVector(); + $score = .0; + + foreach ($vector as $tokenId) { + if (isset($keywordRankMap[$tokenId])) { + $score += $keywordRankMap[$tokenId]; + } + } + + $score = $score / count($vector); + $sentenceOutput = new OutputValue(); + $sentenceOutput->setId($sentence->getId()); + $sentenceOutput->setRank($score); + $sentenceOutput->setValue($sentence->getOriginalValue()); + + $sentenceOutputList[] = $sentenceOutput; + } + + return $sentenceOutputList; + } +} diff --git a/src/Service/SentenceWeightingInterface.php b/src/Service/SentenceWeightingInterface.php new file mode 100644 index 0000000..7bd29c5 --- /dev/null +++ b/src/Service/SentenceWeightingInterface.php @@ -0,0 +1,19 @@ +stopWordCollection = $stopWordCollection; + } + + public function isStopWord(string $word): bool + { + return array_search($word, $this->stopWordCollection->words) !== false; + } +} diff --git a/src/Strategy/PageRankStrategy.php b/src/Strategy/PageRankStrategy.php index a831142..20bab66 100644 --- a/src/Strategy/PageRankStrategy.php +++ b/src/Strategy/PageRankStrategy.php @@ -34,7 +34,7 @@ public function rank(TextInterface $text): NodeCollectionInterface $strategy = $this->createPageRankStrategy($dataSource); $ranking = $this->createRanking($strategy); $pageRankAlgorithm = $this->createPageRankAlgorithm($ranking, $strategy); - $maxIteration = 100; + $maxIteration = 1000; return $pageRankAlgorithm->run($maxIteration); } diff --git a/tests/resource/sample1.txt b/tests/resource/sample1.txt index 849e2a3..bab87cf 100644 --- a/tests/resource/sample1.txt +++ b/tests/resource/sample1.txt @@ -1 +1,2 @@ Over the past fortnight we asked you to nominate your top extensions for the GNOME desktop. And you did just that. Having now sifted through the hundreds of entries, we’re ready to reveal your favourite GNOME Shell extensions. GNOME 3 (which is more commonly used with the GNOME Shell) has an extension framework that lets developers (and users) extend, build on, and shape how the desktop looks, acts and functions. Dash to Dock takes the GNOME Dash — this is the ‘favourites bar’ that appears on the left-hand side of the screen in the Activities overlay — and transforms it into a desktop dock. And just like Plank, Docky or AWN you can add app launchers, rearrange them, and use them to minimise, restore and switch between app windows. Dash to Dock has many of the common “Dock” features you’d expect, including autohide and intellihide, a fixed-width mode, adjustable icon size, and custom themes. My biggest pet peeve with GNOME Shell is its legacy app tray that hides in the bottom left of the screen. All extraneous non-system applets, indicators and tray icons hide down here. This makes it a little harder to use applications that rely on a system tray presence, like Skype, Franz, Telegram, and Dropbox. TopIcons Plus is the quick way to put GNOME system tray icons back where they belong: on show and in reach. The extension moves legacy tray icons from the bottom left of Gnome Shell to the right-hand side of the top panel. A well-stocked settings panel lets you adjust icon opacity, color, padding, size and tray position. Dive into the settings to adjust the sizing, styling and positioning of icons. Like the popular daily stimulant of choice, the Caffeine GNOME extension keeps your computer awake. It couldn’t be simpler to use: just click the empty mug icon. An empty cup means you’re using normal auto suspend rules – e.g., a screensaver – while a freshly brewed cup of coffee means auto suspend and screensaver are turned off. The Caffeine GNOME extension supports GNOME Shell 3.4 or later. Familiar with applications like Guake and Tilda? If so, you’ll instantly see the appeal of the (superbly named) Drop Down Terminal GNOME extension. When installed just tap the key above the tab key (though it can be changed to almost any key you wish) to get instant access to the command line. Want to speed up using workspaces? This simple tool lets you do just that. Once installed you can quickly switch between workspaces by scrolling over the top panel - no need to enter the Activities Overlay! +t \ No newline at end of file From 8c776bb432dbc1824a903040e2ece5bfa19148a9 Mon Sep 17 00:00:00 2001 From: David Belicza <87.bdavid@gmail.com> Date: Sat, 26 Sep 2020 15:32:03 +0200 Subject: [PATCH 4/5] Weighting sentences, added stopword csv reader, added request object --- .gitignore | 1 + resource/stop-word/english.csv | 318 ++++++++ resource/stop-word/french.csv | 689 +++++++++++++++++ resource/stop-word/german.csv | 598 +++++++++++++++ resource/stop-word/italian.csv | 660 ++++++++++++++++ resource/stop-word/norwegian.csv | 221 ++++++ resource/stop-word/russian.csv | 559 ++++++++++++++ resource/stop-word/spanish.csv | 721 ++++++++++++++++++ src/Builder/AlgorithmOutputBuilder.php | 64 ++ ...hp => AlgorithmOutputBuilderInterface.php} | 38 +- src/Builder/PageRankDataSourceBuilder.php | 94 +-- src/Builder/StopWordCollectionBuilder.php | 31 + .../StopWordCollectionBuilderInterface.php | 12 + src/Builder/TextBuilder.php | 110 +-- src/Builder/TextBuilderInterface.php | 24 +- src/Builder/TextRankOutputBuilder.php | 60 -- ...TextRankOutput.php => AlgorithmOutput.php} | 62 +- ...rface.php => AlgorithmOutputInterface.php} | 60 +- src/Data/AlgorithmRequest.php | 76 ++ src/Data/AlgorithmRequestInterface.php | 58 ++ .../OutputValue.php => RankDataObject.php} | 84 +- ...erface.php => RankDataObjectInterface.php} | 40 +- src/Data/StopWordCollection.php | 349 +-------- src/Data/StopWordCollectionInterface.php | 10 + src/Data/Text.php | 62 +- src/Data/Text/Sentence.php | 104 +-- src/Data/Text/SentenceInterface.php | 60 +- src/Data/Text/TokenMap.php | 50 +- src/Data/Text/TokenMapInterface.php | 34 +- src/Data/TextInterface.php | 36 +- src/Exception/IoException.php | 10 + src/Exception/TextRankException.php | 12 + src/Facade/TextRank.php | 100 ++- src/Factory/GeneralFactory.php | 59 ++ src/Factory/GeneralFactoryInterface.php | 21 + src/Service/CsvReader.php | 33 + src/Service/GetTopNodes.php | 30 - src/Service/GetTopNodesInterface.php | 22 - src/Service/Parser.php | 125 +-- src/Service/ParserInterface.php | 12 + src/Service/ReaderInterface.php | 12 + src/Service/SentenceWeighting.php | 112 +-- src/Service/SentenceWeightingInterface.php | 42 +- src/Service/SortRankDataList.php | 28 + src/Service/SortRankDataListInterface.php | 18 + src/Service/StopWordFilter.php | 23 - src/Strategy/PageRankStrategy.php | 163 ++-- .../RankingAlgorithmStrategyInterface.php | 29 +- src/TextRankFacade.php | 36 +- 49 files changed, 4986 insertions(+), 1186 deletions(-) create mode 100644 resource/stop-word/english.csv create mode 100644 resource/stop-word/french.csv create mode 100644 resource/stop-word/german.csv create mode 100644 resource/stop-word/italian.csv create mode 100644 resource/stop-word/norwegian.csv create mode 100644 resource/stop-word/russian.csv create mode 100644 resource/stop-word/spanish.csv create mode 100644 src/Builder/AlgorithmOutputBuilder.php rename src/Builder/{TextRankOutputBuilderInterface.php => AlgorithmOutputBuilderInterface.php} (57%) create mode 100644 src/Builder/StopWordCollectionBuilder.php create mode 100644 src/Builder/StopWordCollectionBuilderInterface.php delete mode 100644 src/Builder/TextRankOutputBuilder.php rename src/Data/{TextRankOutput.php => AlgorithmOutput.php} (85%) rename src/Data/{TextRankOutputInterface.php => AlgorithmOutputInterface.php} (52%) create mode 100644 src/Data/AlgorithmRequest.php create mode 100644 src/Data/AlgorithmRequestInterface.php rename src/Data/{TextRankOutput/OutputValue.php => RankDataObject.php} (80%) rename src/Data/{TextRankOutput/OutputValueInterface.php => RankDataObjectInterface.php} (74%) create mode 100644 src/Data/StopWordCollectionInterface.php create mode 100644 src/Exception/IoException.php create mode 100644 src/Exception/TextRankException.php create mode 100644 src/Factory/GeneralFactory.php create mode 100644 src/Factory/GeneralFactoryInterface.php create mode 100644 src/Service/CsvReader.php delete mode 100644 src/Service/GetTopNodes.php delete mode 100644 src/Service/GetTopNodesInterface.php create mode 100644 src/Service/ParserInterface.php create mode 100644 src/Service/ReaderInterface.php create mode 100644 src/Service/SortRankDataList.php create mode 100644 src/Service/SortRankDataListInterface.php delete mode 100644 src/Service/StopWordFilter.php diff --git a/.gitignore b/.gitignore index af0e4c2..fcab08f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ /.idea /.tmp +/tests/.phpunit.result.cache /composer.lock /vendor /var diff --git a/resource/stop-word/english.csv b/resource/stop-word/english.csv new file mode 100644 index 0000000..a338b15 --- /dev/null +++ b/resource/stop-word/english.csv @@ -0,0 +1,318 @@ +a +about +above +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thickv +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/resource/stop-word/french.csv b/resource/stop-word/french.csv new file mode 100644 index 0000000..3909d03 --- /dev/null +++ b/resource/stop-word/french.csv @@ -0,0 +1,689 @@ +a +abord +absolument +afin +ah +ai +aie +aient +aies +ailleurs +ainsi +ait +allaient +allo +allons +allô +alors +anterieur +anterieure +anterieures +apres +après +as +assez +attendu +au +aucun +aucune +aucuns +aujourd +aujourd\'hui +aupres +auquel +aura +aurai +auraient +aurais +aurait +auras +aurez +auriez +aurions +aurons +auront +aussi +autre +autrefois +autrement +autres +autrui +aux +auxquelles +auxquels +avaient +avais +avait +avant +avec +avez +aviez +avions +avoir +avons +ayant +ayez +ayons +b +bah +bas +basee +bat +beau +beaucoup +bien +bigre +bon +boum +bravo +brrr +c +car +ce +ceci +cela +celle +celle-ci +celle-là +celles +celles-ci +celles-là +celui +celui-ci +celui-là +celà +cent +cependant +certain +certaine +certaines +certains +certes +ces +cet +cette +ceux +ceux-ci +ceux-là +chacun +chacune +chaque +cher +chers +chez +chiche +chut +chère +chères +ci +cinq +cinquantaine +cinquante +cinquantième +cinquième +clac +clic +combien +comme +comment +comparable +comparables +compris +concernant +contre +couic +crac +d +da +dans +de +debout +dedans +dehors +deja +delà +depuis +dernier +derniere +derriere +derrière +des +desormais +desquelles +desquels +dessous +dessus +deux +deuxième +deuxièmement +devant +devers +devra +devrait +different +differentes +differents +différent +différente +différentes +différents +dire +directe +directement +dit +dite +dits +divers +diverse +diverses +dix +dix-huit +dix-neuf +dix-sept +dixième +doit +doivent +donc +dont +dos +douze +douzième +dring +droite +du +duquel +durant +dès +début +désormais +e +effet +egale +egalement +egales +eh +elle +elle-même +elles +elles-mêmes +en +encore +enfin +entre +envers +environ +es +essai +est +et +etant +etc +etre +eu +eue +eues +euh +eurent +eus +eusse +eussent +eusses +eussiez +eussions +eut +eux +eux-mêmes +exactement +excepté +extenso +exterieur +eûmes +eût +eûtes +f +fais +faisaient +faisant +fait +faites +façon +feront +fi +flac +floc +fois +font +force +furent +fus +fusse +fussent +fusses +fussiez +fussions +fut +fûmes +fût +fûtes +g +gens +h +ha +haut +hein +hem +hep +hi +ho +holà +hop +hormis +hors +hou +houp +hue +hui +huit +huitième +hum +hurrah +hé +hélas +i +ici +il +ils +importe +j +je +jusqu +jusque +juste +k +l +la +laisser +laquelle +las +le +lequel +les +lesquelles +lesquels +leur +leurs +longtemps +lors +lorsque +lui +lui-meme +lui-même +là +lès +m +ma +maint +maintenant +mais +malgre +malgré +maximale +me +meme +memes +merci +mes +mien +mienne +miennes +miens +mille +mince +mine +minimale +moi +moi-meme +moi-même +moindres +moins +mon +mot +moyennant +multiple +multiples +même +mêmes +n +na +naturel +naturelle +naturelles +ne +neanmoins +necessaire +necessairement +neuf +neuvième +ni +nombreuses +nombreux +nommés +non +nos +notamment +notre +nous +nous-mêmes +nouveau +nouveaux +nul +néanmoins +nôtre +nôtres +o +oh +ohé +ollé +olé +on +ont +onze +onzième +ore +ou +ouf +ouias +oust +ouste +outre +ouvert +ouverte +ouverts +o| +où +p +paf +pan +par +parce +parfois +parle +parlent +parler +parmi +parole +parseme +partant +particulier +particulière +particulièrement +pas +passé +pendant +pense +permet +personne +personnes +peu +peut +peuvent +peux +pff +pfft +pfut +pif +pire +pièce +plein +plouf +plupart +plus +plusieurs +plutôt +possessif +possessifs +possible +possibles +pouah +pour +pourquoi +pourrais +pourrait +pouvait +prealable +precisement +premier +première +premièrement +pres +probable +probante +procedant +proche +près +psitt +pu +puis +puisque +pur +pure +q +qu +quand +quant +quant-à-soi +quanta +quarante +quatorze +quatre +quatre-vingt +quatrième +quatrièmement +que +quel +quelconque +quelle +quelles +quelqu\'un +quelque +quelques +quels +qui +quiconque +quinze +quoi +quoique +r +rare +rarement +rares +relative +relativement +remarquable +rend +rendre +restant +reste +restent +restrictif +retour +revoici +revoilà +rien +s +sa +sacrebleu +sait +sans +sapristi +sauf +se +sein +seize +selon +semblable +semblaient +semble +semblent +sent +sept +septième +sera +serai +seraient +serais +serait +seras +serez +seriez +serions +serons +seront +ses +seul +seule +seulement +si +sien +sienne +siennes +siens +sinon +six +sixième +soi +soi-même +soient +sois +soit +soixante +sommes +son +sont +sous +souvent +soyez +soyons +specifique +specifiques +speculatif +stop +strictement +subtiles +suffisant +suffisante +suffit +suis +suit +suivant +suivante +suivantes +suivants +suivre +sujet +superpose +sur +surtout +t +ta +tac +tandis +tant +tardive +te +tel +telle +tellement +telles +tels +tenant +tend +tenir +tente +tes +tic +tien +tienne +tiennes +tiens +toc +toi +toi-même +ton +touchant +toujours +tous +tout +toute +toutefois +toutes +treize +trente +tres +trois +troisième +troisièmement +trop +très +tsoin +tsouin +tu +té +u +un +une +unes +uniformement +unique +uniques +uns +v +va +vais +valeur +vas +vers +via +vif +vifs +vingt +vivat +vive +vives +vlan +voici +voie +voient +voilà +vont +vos +votre +vous +vous-mêmes +vu +vé +vôtre +vôtres +w +x +y +z +zut +à +â +ça +ès +étaient +étais +était +étant +état +étiez +étions +été +étée +étées +étés +êtes +être +ô diff --git a/resource/stop-word/german.csv b/resource/stop-word/german.csv new file mode 100644 index 0000000..fce38c4 --- /dev/null +++ b/resource/stop-word/german.csv @@ -0,0 +1,598 @@ +ab +aber +alle +allein +allem +allen +aller +allerdings +allerlei +alles +allmählich +allzu +als +alsbald +also +am +an +and +ander +andere +anderem +anderen +anderer +andererseits +anderes +anderm +andern +andernfalls +anders +anstatt +auch +auf +aus +ausgenommen +ausser +ausserdem +außer +außerdem +außerhalb +bald +bei +beide +beiden +beiderlei +beides +beim +beinahe +bereits +besonders +besser +beträchtlich +bevor +bezüglich +bin +bis +bisher +bislang +bist +bloß +bsp. +bzw +ca +ca. +content +da +dabei +dadurch +dafür +dagegen +daher +dahin +damals +damit +danach +daneben +dann +daran +darauf +daraus +darin +darum +darunter +darüber +darüberhinaus +das +dass +dasselbe +davon +davor +dazu +daß +dein +deine +deinem +deinen +deiner +deines +dem +demnach +demselben +den +denen +denn +dennoch +denselben +der +derart +derartig +derem +deren +derer +derjenige +derjenigen +derselbe +derselben +derzeit +des +deshalb +desselben +dessen +desto +deswegen +dich +die +diejenige +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +diesseits +dir +direkt +direkte +direkten +direkter +doch +dort +dorther +dorthin +drauf +drin +drunter +drüber +du +dunklen +durch +durchaus +eben +ebenfalls +ebenso +eher +eigenen +eigenes +eigentlich +ein +eine +einem +einen +einer +einerseits +eines +einfach +einführen +einführte +einführten +eingesetzt +einig +einige +einigem +einigen +einiger +einigermaßen +einiges +einmal +eins +einseitig +einseitige +einseitigen +einseitiger +einst +einstmals +einzig +entsprechend +entweder +er +erst +es +etc +etliche +etwa +etwas +euch +euer +eure +eurem +euren +eurer +eures +falls +fast +ferner +folgende +folgenden +folgender +folgendes +folglich +fuer +für +gab +ganze +ganzem +ganzen +ganzer +ganzes +gar +gegen +gemäss +ggf +gleich +gleichwohl +gleichzeitig +glücklicherweise +gänzlich +hab +habe +haben +haette +hast +hat +hatte +hatten +hattest +hattet +heraus +herein +hier +hier +hinter +hiermit +hiesige +hin +hinein +hinten +hinter +hinterher +http +hätt +hätte +hätten +höchstens +ich +igitt +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +im +immer +immerhin +in +indem +indessen +infolge +innen +innerhalb +ins +insofern +inzwischen +irgend +irgendeine +irgendwas +irgendwen +irgendwer +irgendwie +irgendwo +ist +ja +je +jed +jede +jedem +jeden +jedenfalls +jeder +jederlei +jedes +jedoch +jemand +jene +jenem +jenen +jener +jenes +jenseits +jetzt +jährig +jährige +jährigen +jähriges +kam +kann +kannst +kaum +kein +keine +keinem +keinen +keiner +keinerlei +keines +keineswegs +klar +klare +klaren +klares +klein +kleinen +kleiner +kleines +koennen +koennt +koennte +koennten +komme +kommen +kommt +konkret +konkrete +konkreten +konkreter +konkretes +können +könnt +künftig +leider +machen +man +manche +manchem +manchen +mancher +mancherorts +manches +manchmal +mehr +mehrere +mein +meine +meinem +meinen +meiner +meines +mich +mir +mit +mithin +muessen +muesst +muesste +muss +musst +musste +mussten +muß +mußt +müssen +müsste +müssten +müßt +müßte +nach +nachdem +nachher +nachhinein +nahm +natürlich +neben +nebenan +nehmen +nein +nicht +nichts +nie +niemals +niemand +nirgends +nirgendwo +noch +nun +nur +nächste +nämlich +nötigenfalls +ob +oben +oberhalb +obgleich +obschon +obwohl +oder +oft +per +plötzlich +schließlich +schon +sehr +sehrwohl +seid +sein +seine +seinem +seinen +seiner +seines +seit +seitdem +seither +selber +selbst +sich +sicher +sicherlich +sie +sind +so +sobald +sodass +sodaß +soeben +sofern +sofort +sogar +solange +solch +solche +solchem +solchen +solcher +solches +soll +sollen +sollst +sollt +sollte +sollten +solltest +somit +sondern +sonst +sonstwo +sooft +soviel +soweit +sowie +sowohl +tatsächlich +tatsächlichen +tatsächlicher +tatsächliches +trotzdem +ueber +um +umso +unbedingt +und +unmöglich +unmögliche +unmöglichen +unmöglicher +uns +unser +unser +unsere +unsere +unserem +unseren +unserer +unseres +unter +usw +viel +viele +vielen +vieler +vieles +vielleicht +vielmals +vom +von +vor +voran +vorher +vorüber +völlig +wann +war +waren +warst +warum +was +weder +weil +weiter +weitere +weiterem +weiteren +weiterer +weiteres +weiterhin +weiß +welche +welchem +welchen +welcher +welches +wem +wen +wenig +wenige +weniger +wenigstens +wenn +wenngleich +wer +werde +werden +werdet +weshalb +wessen +wichtig +wie +wieder +wieso +wieviel +wiewohl +will +willst +wir +wird +wirklich +wirst +wo +wodurch +wogegen +woher +wohin +wohingegen +wohl +wohlweislich +womit +woraufhin +woraus +worin +wurde +wurden +während +währenddessen +wär +wäre +wären +würde +würden +z.B. +zB +zahlreich +zeitweise +zu +zudem +zuerst +zufolge +zugleich +zuletzt +zum +zumal +zur +zurück +zusammen +zuviel +zwar +zwischen +ähnlich +übel +über +überall +überallhin +überdies +übermorgen +übrig +übrigens diff --git a/resource/stop-word/italian.csv b/resource/stop-word/italian.csv new file mode 100644 index 0000000..bdedf2e --- /dev/null +++ b/resource/stop-word/italian.csv @@ -0,0 +1,660 @@ +a +abbastanza +abbia +abbiamo +abbiano +abbiate +accidenti +ad +adesso +affinche +agl +agli +ahime +ahim㨠+ahimè +ai +al +alcuna +alcuni +alcuno +all +alla +alle +allo +allora +altre +altri +altrimenti +altro +altrove +altrui +anche +ancora +anni +anno +ansa +anticipo +assai +attesa +attraverso +avanti +avemmo +avendo +avente +aver +avere +averlo +avesse +avessero +avessi +avessimo +aveste +avesti +avete +aveva +avevamo +avevano +avevate +avevi +avevo +avrai +avranno +avrebbe +avrebbero +avrei +avremmo +avremo +avreste +avresti +avrete +avrà +avrò +avuta +avute +avuti +avuto +basta +ben +bene +benissimo +berlusconi +brava +bravo +buono +c +casa +caso +cento +certa +certe +certi +certo +che +chi +chicchessia +chiunque +ci +ciascuna +ciascuno +cima +cinque +cio +cioe +cio㨠+cioè +circa +citta +città +cittã +ciã² +ciò +co +codesta +codesti +codesto +cogli +coi +col +colei +coll +coloro +colui +come +cominci +comprare +comunque +con +concernente +conciliarsi +conclusione +consecutivi +consecutivo +consiglio +contro +cortesia +cos +cosa +cosi +cos㬠+così +cui +d +da +dagl +dagli +dai +dal +dall +dalla +dalle +dallo +dappertutto +davanti +degl +degli +dei +del +dell +della +delle +dello +dentro +detto +deve +devo +di +dice +dietro +dire +dirimpetto +diventa +diventare +diventato +dopo +doppio +dov +dove +dovra +dovrà +dovrã +dovunque +due +dunque +durante +e +ebbe +ebbero +ebbi +ecc +ecco +ed +effettivamente +egli +ella +entrambi +eppure +era +erano +eravamo +eravate +eri +ero +esempio +esse +essendo +esser +essere +essi +ex +fa +faccia +facciamo +facciano +facciate +faccio +facemmo +facendo +facesse +facessero +facessi +facessimo +faceste +facesti +faceva +facevamo +facevano +facevate +facevi +facevo +fai +fanno +farai +faranno +fare +farebbe +farebbero +farei +faremmo +faremo +fareste +faresti +farete +farà +farò +fatto +favore +fece +fecero +feci +fin +finalmente +finche +fine +fino +forse +forza +fosse +fossero +fossi +fossimo +foste +fosti +fra +frattempo +fu +fui +fummo +fuori +furono +futuro +generale +gente +gia +giacche +giorni +giorno +giu +già +giã +gli +gliela +gliele +glieli +glielo +gliene +governo +grande +grazie +gruppo +ha +haha +hai +hanno +ho +i +ie +ieri +il +improvviso +in +inc +indietro +infatti +inoltre +insieme +intanto +intorno +invece +io +l +la +lasciato +lato +lavoro +le +lei +li +lo +lontano +loro +lui +lungo +luogo +là +lã +ma +macche +magari +maggior +mai +male +malgrado +malissimo +mancanza +marche +me +medesimo +mediante +meglio +meno +mentre +mesi +mezzo +mi +mia +mie +miei +mila +miliardi +milioni +minimi +ministro +mio +modo +molta +molti +moltissimo +molto +momento +mondo +mosto +nazionale +ne +negl +negli +nei +nel +nell +nella +nelle +nello +nemmeno +neppure +nessun +nessuna +nessuno +niente +no +noi +nome +non +nondimeno +nonostante +nonsia +nostra +nostre +nostri +nostro +novanta +nove +nulla +nuovi +nuovo +o +od +oggi +ogni +ognuna +ognuno +oltre +oppure +ora +ore +osi +ossia +ottanta +otto +paese +parecchi +parecchie +parecchio +parte +partendo +peccato +peggio +per +perche +perch㨠+perchè +perché +percio +perciã² +perciò +perfino +pero +persino +persone +perã² +però +piedi +pieno +piglia +piu +piuttosto +piã¹ +più +po +pochissimo +poco +poi +poiche +possa +possedere +posteriore +posto +potrebbe +preferibilmente +presa +press +prima +primo +principalmente +probabilmente +promesso +proprio +puo +pure +purtroppo +puã² +può +qua +qualche +qualcosa +qualcuna +qualcuno +quale +quali +qualunque +quando +quanta +quante +quanti +quanto +quantunque +quarto +quasi +quattro +quel +quella +quelle +quelli +quello +quest +questa +queste +questi +questo +qui +quindi +quinto +realmente +recente +recentemente +registrazione +relativo +riecco +rispetto +salvo +sara +sarai +saranno +sarebbe +sarebbero +sarei +saremmo +saremo +sareste +saresti +sarete +sarà +sarã +sarò +scola +scopo +scorso +se +secondo +seguente +seguito +sei +sembra +sembrare +sembrato +sembrava +sembri +sempre +senza +sette +si +sia +siamo +siano +siate +siete +sig +solito +solo +soltanto +sono +sopra +soprattutto +sotto +spesso +srl +sta +stai +stando +stanno +starai +staranno +starebbe +starebbero +starei +staremmo +staremo +stareste +staresti +starete +starà +starò +stata +state +stati +stato +stava +stavamo +stavano +stavate +stavi +stavo +stemmo +stessa +stesse +stessero +stessi +stessimo +stesso +steste +stesti +stette +stettero +stetti +stia +stiamo +stiano +stiate +sto +su +sua +subito +successivamente +successivo +sue +sugl +sugli +sui +sul +sull +sulla +sulle +sullo +suo +suoi +tale +tali +talvolta +tanto +te +tempo +terzo +th +ti +titolo +torino +tra +tranne +tre +trenta +triplo +troppo +trovato +tu +tua +tue +tuo +tuoi +tutta +tuttavia +tutte +tutti +tutto +uguali +ulteriore +ultimo +un +una +uno +uomo +va +vai +vale +vari +varia +varie +vario +verso +vi +via +vicino +visto +vita +voi +volta +volte +vostra +vostre +vostri +vostro +㨠+è diff --git a/resource/stop-word/norwegian.csv b/resource/stop-word/norwegian.csv new file mode 100644 index 0000000..e93528f --- /dev/null +++ b/resource/stop-word/norwegian.csv @@ -0,0 +1,221 @@ +alle +andre +arbeid +at +av +bare +begge +ble +blei +bli +blir +blitt +bort +bra +bruke +både +båe +da +de +deg +dei +deim +deira +deires +dem +den +denne +der +dere +deres +det +dette +di +din +disse +ditt +du +dykk +dykkar +då +eg +ein +eit +eitt +eller +elles +en +ene +eneste +enhver +enn +er +et +ett +etter +folk +for +fordi +forsûke +fra +få +før +fûr +fûrst +gjorde +gjûre +god +gå +ha +hadde +han +hans +har +hennar +henne +hennes +her +hjå +ho +hoe +honom +hoss +hossen +hun +hva +hvem +hver +hvilke +hvilken +hvis +hvor +hvordan +hvorfor +i +ikke +ikkje +ingen +ingi +inkje +inn +innen +inni +ja +jeg +kan +kom +korleis +korso +kun +kunne +kva +kvar +kvarhelst +kven +kvi +kvifor +lage +lang +lik +like +makt +man +mange +me +med +medan +meg +meget +mellom +men +mens +mer +mest +mi +min +mine +mitt +mot +mye +mykje +må +måte +navn +ned +nei +no +noe +noen +noka +noko +nokon +nokor +nokre +ny +nå +når +og +også +om +opp +oss +over +part +punkt +på +rett +riktig +samme +sant +seg +selv +si +sia +sidan +siden +sin +sine +sist +sitt +sjøl +skal +skulle +slik +slutt +so +som +somme +somt +start +stille +så +sånn +tid +til +tilbake +tilstand +um +under +upp +ut +uten +var +vart +varte +ved +verdi +vere +verte +vi +vil +ville +vite +vore +vors +vort +vår +være +vært +vöre +vört +å diff --git a/resource/stop-word/russian.csv b/resource/stop-word/russian.csv new file mode 100644 index 0000000..38c4f2a --- /dev/null +++ b/resource/stop-word/russian.csv @@ -0,0 +1,559 @@ +c +а +алло +без +белый +близко +более +больше +большой +будем +будет +будете +будешь +будто +буду +будут +будь +бы +бывает +бывь +был +была +были +было +быть +в +важная +важное +важные +важный +вам +вами +вас +ваш +ваша +ваше +ваши +вверх +вдали +вдруг +ведь +везде +вернуться +весь +вечер +взгляд +взять +вид +видел +видеть +вместе +вне +вниз +внизу +во +вода +война +вокруг +вон +вообще +вопрос +восемнадцатый +восемнадцать +восемь +восьмой +вот +впрочем +времени +время +все +все еще +всегда +всего +всем +всеми +всему +всех +всею +всю +всюду +вся +всё +второй +вы +выйти +г +где +главный +глаз +говорил +говорит +говорить +год +года +году +голова +голос +город +да +давать +давно +даже +далекий +далеко +дальше +даром +дать +два +двадцатый +двадцать +две +двенадцатый +двенадцать +дверь +двух +девятнадцатый +девятнадцать +девятый +девять +действительно +дел +делал +делать +делаю +дело +день +деньги +десятый +десять +для +до +довольно +долго +должен +должно +должный +дом +дорога +друг +другая +другие +других +друго +другое +другой +думать +душа +е +его +ее +ей +ему +если +есть +еще +ещё +ею +её +ж +ждать +же +жена +женщина +жизнь +жить +за +занят +занята +занято +заняты +затем +зато +зачем +здесь +земля +знать +значит +значить +и +иди +идти +из +или +им +имеет +имел +именно +иметь +ими +имя +иногда +их +к +каждая +каждое +каждые +каждый +кажется +казаться +как +какая +какой +кем +книга +когда +кого +ком +комната +кому +конец +конечно +которая +которого +которой +которые +который +которых +кроме +кругом +кто +куда +лежать +лет +ли +лицо +лишь +лучше +любить +люди +м +маленький +мало +мать +машина +между +меля +менее +меньше +меня +место +миллионов +мимо +минута +мир +мира +мне +много +многочисленная +многочисленное +многочисленные +многочисленный +мной +мною +мог +могу +могут +мож +может +может быть +можно +можхо +мои +мой +мор +москва +мочь +моя +моё +мы +на +наверху +над +надо +назад +наиболее +найти +наконец +нам +нами +народ +нас +начала +начать +наш +наша +наше +наши +не +него +недавно +недалеко +нее +ней +некоторый +нельзя +нем +немного +нему +непрерывно +нередко +несколько +нет +нею +неё +ни +нибудь +ниже +низко +никакой +никогда +никто +никуда +ним +ними +них +ничего +ничто +но +новый +нога +ночь +ну +нужно +нужный +нх +о +об +оба +обычно +один +одиннадцатый +одиннадцать +однажды +однако +одного +одной +оказаться +окно +около +он +она +они +оно +опять +особенно +остаться +от +ответить +отец +откуда +отовсюду +отсюда +очень +первый +перед +писать +плечо +по +под +подойди +подумать +пожалуйста +позже +пойти +пока +пол +получить +помнить +понимать +понять +пор +пора +после +последний +посмотреть +посреди +потом +потому +почему +почти +правда +прекрасно +при +про +просто +против +процентов +путь +пятнадцатый +пятнадцать +пятый +пять +работа +работать +раз +разве +рано +раньше +ребенок +решить +россия +рука +русский +ряд +рядом +с +с кем +сам +сама +сами +самим +самими +самих +само +самого +самой +самом +самому +саму +самый +свет +свое +своего +своей +свои +своих +свой +свою +сделать +сеаой +себе +себя +сегодня +седьмой +сейчас +семнадцатый +семнадцать +семь +сидеть +сила +сих +сказал +сказала +сказать +сколько +слишком +слово +случай +смотреть +сначала +снова +со +собой +собою +советский +совсем +спасибо +спросить +сразу +стал +старый +стать +стол +сторона +стоять +страна +суть +считать +т +та +так +такая +также +таки +такие +такое +такой +там +твои +твой +твоя +твоё +те +тебе +тебя +тем +теми +теперь +тех +то +тобой +тобою +товарищ +тогда +того +тоже +только +том +тому +тот +тою +третий +три +тринадцатый +тринадцать +ту +туда +тут +ты +тысяч +у +увидеть +уж +уже +улица +уметь +утро +хороший +хорошо +хотел бы +хотеть +хоть +хотя +хочешь +час +часто +часть +чаще +чего +человек +чем +чему +через +четвертый +четыре +четырнадцатый +четырнадцать +что +чтоб +чтобы +чуть +шестнадцатый +шестнадцать +шестой +шесть +эта +эти +этим +этими +этих +это +этого +этой +этом +этому +этот +эту +я +являюсь diff --git a/resource/stop-word/spanish.csv b/resource/stop-word/spanish.csv new file mode 100644 index 0000000..3b44c2c --- /dev/null +++ b/resource/stop-word/spanish.csv @@ -0,0 +1,721 @@ +a +actualmente +acuerdo +adelante +ademas +además +adrede +afirmó +agregó +ahi +ahora +ahí +al +algo +alguna +algunas +alguno +algunos +algún +alli +allí +alrededor +ambos +ampleamos +antano +antaño +ante +anterior +antes +apenas +aproximadamente +aquel +aquella +aquellas +aquello +aquellos +aqui +aquél +aquélla +aquéllas +aquéllos +aquí +arriba +arribaabajo +aseguró +asi +así +atras +aun +aunque +ayer +añadió +aún +b +bajo +bastante +bien +breve +buen +buena +buenas +bueno +buenos +c +cada +casi +cerca +cierta +ciertas +cierto +ciertos +cinco +claro +comentó +como +con +conmigo +conocer +conseguimos +conseguir +considera +consideró +consigo +consigue +consiguen +consigues +contigo +contra +cosas +creo +cual +cuales +cualquier +cuando +cuanta +cuantas +cuanto +cuantos +cuatro +cuenta +cuál +cuáles +cuándo +cuánta +cuántas +cuánto +cuántos +cómo +d +da +dado +dan +dar +de +debajo +debe +deben +debido +decir +dejó +del +delante +demasiado +demás +dentro +deprisa +desde +despacio +despues +después +detras +detrás +dia +dias +dice +dicen +dicho +dieron +diferente +diferentes +dijeron +dijo +dio +donde +dos +durante +día +días +dónde +e +ejemplo +el +ella +ellas +ello +ellos +embargo +empleais +emplean +emplear +empleas +empleo +en +encima +encuentra +enfrente +enseguida +entonces +entre +era +erais +eramos +eran +eras +eres +es +esa +esas +ese +eso +esos +esta +estaba +estabais +estaban +estabas +estad +estada +estadas +estado +estados +estais +estamos +estan +estando +estar +estaremos +estará +estarán +estarás +estaré +estaréis +estaría +estaríais +estaríamos +estarían +estarías +estas +este +estemos +esto +estos +estoy +estuve +estuviera +estuvierais +estuvieran +estuvieras +estuvieron +estuviese +estuvieseis +estuviesen +estuvieses +estuvimos +estuviste +estuvisteis +estuviéramos +estuviésemos +estuvo +está +estábamos +estáis +están +estás +esté +estéis +estén +estés +ex +excepto +existe +existen +explicó +expresó +f +fin +final +fue +fuera +fuerais +fueran +fueras +fueron +fuese +fueseis +fuesen +fueses +fui +fuimos +fuiste +fuisteis +fuéramos +fuésemos +g +general +gran +grandes +gueno +h +ha +haber +habia +habida +habidas +habido +habidos +habiendo +habla +hablan +habremos +habrá +habrán +habrás +habré +habréis +habría +habríais +habríamos +habrían +habrías +habéis +había +habíais +habíamos +habían +habías +hace +haceis +hacemos +hacen +hacer +hacerlo +haces +hacia +haciendo +hago +han +has +hasta +hay +haya +hayamos +hayan +hayas +hayáis +he +hecho +hemos +hicieron +hizo +horas +hoy +hube +hubiera +hubierais +hubieran +hubieras +hubieron +hubiese +hubieseis +hubiesen +hubieses +hubimos +hubiste +hubisteis +hubiéramos +hubiésemos +hubo +i +igual +incluso +indicó +informo +informó +intenta +intentais +intentamos +intentan +intentar +intentas +intento +ir +j +junto +k +l +la +lado +largo +las +le +lejos +les +llegó +lleva +llevar +lo +los +luego +lugar +m +mal +manera +manifestó +mas +mayor +me +mediante +medio +mejor +mencionó +menos +menudo +mi +mia +mias +mientras +mio +mios +mis +misma +mismas +mismo +mismos +modo +momento +mucha +muchas +mucho +muchos +muy +más +mí +mía +mías +mío +míos +n +nada +nadie +ni +ninguna +ningunas +ninguno +ningunos +ningún +no +nos +nosotras +nosotros +nuestra +nuestras +nuestro +nuestros +nueva +nuevas +nuevo +nuevos +nunca +o +ocho +os +otra +otras +otro +otros +p +pais +para +parece +parte +partir +pasada +pasado +paìs +peor +pero +pesar +poca +pocas +poco +pocos +podeis +podemos +poder +podria +podriais +podriamos +podrian +podrias +podrá +podrán +podría +podrían +poner +por +por qué +porque +posible +primer +primera +primero +primeros +principalmente +pronto +propia +propias +propio +propios +proximo +próximo +próximos +pudo +pueda +puede +pueden +puedo +pues +q +qeu +que +quedó +queremos +quien +quienes +quiere +quiza +quizas +quizá +quizás +quién +quiénes +qué +r +raras +realizado +realizar +realizó +repente +respecto +s +sabe +sabeis +sabemos +saben +saber +sabes +sal +salvo +se +sea +seamos +sean +seas +segun +segunda +segundo +según +seis +ser +sera +seremos +será +serán +serás +seré +seréis +sería +seríais +seríamos +serían +serías +seáis +señaló +si +sido +siempre +siendo +siete +sigue +siguiente +sin +sino +sobre +sois +sola +solamente +solas +solo +solos +somos +son +soy +soyos +su +supuesto +sus +suya +suyas +suyo +suyos +sé +sí +sólo +t +tal +tambien +también +tampoco +tan +tanto +tarde +te +temprano +tendremos +tendrá +tendrán +tendrás +tendré +tendréis +tendría +tendríais +tendríamos +tendrían +tendrías +tened +teneis +tenemos +tener +tenga +tengamos +tengan +tengas +tengo +tengáis +tenida +tenidas +tenido +tenidos +teniendo +tenéis +tenía +teníais +teníamos +tenían +tenías +tercera +ti +tiempo +tiene +tienen +tienes +toda +todas +todavia +todavía +todo +todos +total +trabaja +trabajais +trabajamos +trabajan +trabajar +trabajas +trabajo +tras +trata +través +tres +tu +tus +tuve +tuviera +tuvierais +tuvieran +tuvieras +tuvieron +tuviese +tuvieseis +tuviesen +tuvieses +tuvimos +tuviste +tuvisteis +tuviéramos +tuviésemos +tuvo +tuya +tuyas +tuyo +tuyos +tú +u +ultimo +un +una +unas +uno +unos +usa +usais +usamos +usan +usar +usas +uso +usted +ustedes +v +va +vais +valor +vamos +van +varias +varios +vaya +veces +ver +verdad +verdadera +verdadero +vez +vosotras +vosotros +voy +vuestra +vuestras +vuestro +vuestros +w +x +y +ya +yo +z +él +éramos +ésa +ésas +ése +ésos +ésta +éstas +éste +éstos +última +últimas +último +últimos diff --git a/src/Builder/AlgorithmOutputBuilder.php b/src/Builder/AlgorithmOutputBuilder.php new file mode 100644 index 0000000..ecf11dd --- /dev/null +++ b/src/Builder/AlgorithmOutputBuilder.php @@ -0,0 +1,64 @@ +sortRankDataList = $getTopNodes; + } + + public function build( + TextInterface $text, + NodeCollectionInterface $nodeCollection, + array $sentences, + int $maxKeywords, + int $maxSentences + ): AlgorithmOutputInterface { + $words = $this->createWordList($text, $nodeCollection, $maxKeywords); + $sentences = $this->sortRankDataList->sort($sentences); + + $textRankOutput = new AlgorithmOutput(); + $textRankOutput->setKeyWords($words); + $textRankOutput->setSentences($sentences); + + return $textRankOutput; + } + + private function createWordList( + TextInterface $text, + NodeCollectionInterface $nodeCollection, + int $maxKeywords + ): array { + $nodes = $this + ->sortRankDataList + ->sort(array_values($nodeCollection->getNodes())); + $words = []; + + for ($i = 0; $i < $maxKeywords; $i++) { + $nodeId = $nodes[$i]->getId(); + $token = $text->getTokenMap()->getToken($nodeId); + $word = new RankDataObject(); + $word->setId($nodeId); + $word->setValue($token); + $word->setRank($nodes[$i]->getRank()); + + $words[] = $word; + } + + return $words; + } +} diff --git a/src/Builder/TextRankOutputBuilderInterface.php b/src/Builder/AlgorithmOutputBuilderInterface.php similarity index 57% rename from src/Builder/TextRankOutputBuilderInterface.php rename to src/Builder/AlgorithmOutputBuilderInterface.php index 4c42fc6..b556aee 100644 --- a/src/Builder/TextRankOutputBuilderInterface.php +++ b/src/Builder/AlgorithmOutputBuilderInterface.php @@ -1,18 +1,20 @@ -getSentences() as $sentence) { - foreach ($sentence->getVector() as $index => $tokenId) { - if (!isset($dataSource[$tokenId])) { - $dataSource[$tokenId] = [ - self::ID => $tokenId, - self::LEFT => [], - self::RIGHT => [] - ]; - } - - if ($sentence->isIndexExists($index - 1)) { - $previousTokenId = $sentence->getTokenId($index - 1); - if ($text->getTokenMap()->isExists($previousTokenId)) { - $dataSource[$tokenId][self::LEFT][] = $previousTokenId; - } - } - - if ($sentence->isIndexExists($index + 1)) { - $nextTokenId = $sentence->getTokenId($index + 1); - if ($text->getTokenMap()->isExists($nextTokenId)) { - $dataSource[$tokenId][self::RIGHT][] = $nextTokenId; - } - } - } - } - - return $dataSource; - } -} +getSentences() as $sentence) { + foreach ($sentence->getVector() as $index => $tokenId) { + if (!isset($dataSource[$tokenId])) { + $dataSource[$tokenId] = [ + self::ID => $tokenId, + self::LEFT => [], + self::RIGHT => [] + ]; + } + + if ($sentence->isIndexExists($index - 1)) { + $previousTokenId = $sentence->getTokenId($index - 1); + if ($text->getTokenMap()->isExists($previousTokenId)) { + $dataSource[$tokenId][self::LEFT][] = $previousTokenId; + } + } + + if ($sentence->isIndexExists($index + 1)) { + $nextTokenId = $sentence->getTokenId($index + 1); + if ($text->getTokenMap()->isExists($nextTokenId)) { + $dataSource[$tokenId][self::RIGHT][] = $nextTokenId; + } + } + } + } + + return $dataSource; + } +} diff --git a/src/Builder/StopWordCollectionBuilder.php b/src/Builder/StopWordCollectionBuilder.php new file mode 100644 index 0000000..dd78e46 --- /dev/null +++ b/src/Builder/StopWordCollectionBuilder.php @@ -0,0 +1,31 @@ +reader = $reader; + } + + public function build(string $path): StopWordCollectionInterface + { + $words = []; + + foreach ($this->reader->read($path) as $row) { + $words[] = current($row); + } + + return new StopWordCollection($words); + } +} diff --git a/src/Builder/StopWordCollectionBuilderInterface.php b/src/Builder/StopWordCollectionBuilderInterface.php new file mode 100644 index 0000000..0bbb820 --- /dev/null +++ b/src/Builder/StopWordCollectionBuilderInterface.php @@ -0,0 +1,12 @@ + $sentenceTokenList) { - $sentenceVector = []; - foreach ($sentenceTokenList as $token) { - $token = (string)$token; - if (!isset($tokens[$token])) { - $tokens[$token] = $i; - $tokenId = $i; - $i++; - } else { - $tokenId = $tokens[$token]; - } - - $sentenceVector[] = $tokenId; - } - - $sentence = new Sentence(); - $sentence->setId($sentenceIndex); - $sentence->setVector($sentenceVector); - $sentence->setOriginalValue($originalSentences[$sentenceIndex]); - $sentences[] = $sentence; - } - - $tokenMap = new TokenMap(); - $tokenMap->setTokenMap( - array_map( - 'strval', - array_flip($tokens) - ) - ); - - return new Text( - $tokenMap, - $sentences - ); - } -} + $sentenceTokenList) { + $sentenceVector = []; + foreach ($sentenceTokenList as $token) { + $token = (string)$token; + if (!isset($tokens[$token])) { + $tokens[$token] = $i; + $tokenId = $i; + $i++; + } else { + $tokenId = $tokens[$token]; + } + + $sentenceVector[] = $tokenId; + } + + $sentence = new Sentence(); + $sentence->setId($sentenceIndex); + $sentence->setVector($sentenceVector); + $sentence->setOriginalValue($originalSentences[$sentenceIndex]); + $sentences[] = $sentence; + } + + $tokenMap = new TokenMap(); + $tokenMap->setTokenMap( + array_map( + 'strval', + array_flip($tokens) + ) + ); + + return new Text( + $tokenMap, + $sentences + ); + } +} diff --git a/src/Builder/TextBuilderInterface.php b/src/Builder/TextBuilderInterface.php index 1f7d47f..3bf18b3 100644 --- a/src/Builder/TextBuilderInterface.php +++ b/src/Builder/TextBuilderInterface.php @@ -1,12 +1,12 @@ -getTopNodes = $getTopNodes; - $this->sentenceWeighting = $sentenceWeighting; - } - - public function build( - TextInterface $text, - NodeCollectionInterface $nodeCollection, - int $maxKeywords - ): TextRankOutputInterface { - $nodes = $this->getTopNodes->execute($nodeCollection, $maxKeywords); - $words = []; - - foreach ($nodes as $node) { - $token = $text - ->getTokenMap() - ->getToken($node->getId()); - $word = new OutputValue(); - $word->setId($node->getId()); - $word->setValue($token); - $word->setRank($node->getRank()); - - $words[] = $word; - } - - $textRankOutput = new TextRankOutput(); - $textRankOutput->setKeyWords(array_slice($words, 0, $maxKeywords)); - - $sentences = $this->sentenceWeighting->weight($text, $words); - - - - $textRankOutput->setSentences($sentences); - - - return $textRankOutput; - } -} diff --git a/src/Data/TextRankOutput.php b/src/Data/AlgorithmOutput.php similarity index 85% rename from src/Data/TextRankOutput.php rename to src/Data/AlgorithmOutput.php index ce47e08..105ba6f 100644 --- a/src/Data/TextRankOutput.php +++ b/src/Data/AlgorithmOutput.php @@ -1,31 +1,31 @@ -keyWords = $keywords; - } - - public function getKeyWords(): ?array - { - return $this->keyWords; - } - - public function setSentences(array $sentences): void - { - $this->sentences = $sentences; - } - - public function getSentences(): ?array - { - return $this->sentences; - } -} +keyWords = $keywords; + } + + public function getKeyWords(): ?array + { + return $this->keyWords; + } + + public function setSentences(array $sentences): void + { + $this->sentences = $sentences; + } + + public function getSentences(): ?array + { + return $this->sentences; + } +} diff --git a/src/Data/TextRankOutputInterface.php b/src/Data/AlgorithmOutputInterface.php similarity index 52% rename from src/Data/TextRankOutputInterface.php rename to src/Data/AlgorithmOutputInterface.php index 6a5fe1c..0a10c9e 100644 --- a/src/Data/TextRankOutputInterface.php +++ b/src/Data/AlgorithmOutputInterface.php @@ -1,31 +1,29 @@ -stopWordCsvPath = $stopWordCsvPath; + $this->maxKeywords = $maxKeywords; + $this->maxKeySentences = $maxKeySentences; + $this->pageRankPowerIteration = $pageRankPowerIteration; + } + + public function getStopWordCsvPath(): string + { + return $this->stopWordCsvPath; + } + + public function setStopWordCsvPath(string $stopWordCsvPath): void + { + $this->stopWordCsvPath = $stopWordCsvPath; + } + + public function getRawText(): string + { + return $this->rawText; + } + + public function setRawText(string $rawText): void + { + $this->rawText = $rawText; + } + + public function getMaxKeywords(): int + { + return $this->maxKeywords; + } + + public function setMaxKeywords(int $maxKeywords): void + { + $this->maxKeywords = $maxKeywords; + } + + public function getMaxKeySentences(): int + { + return $this->maxKeySentences; + } + + public function setMaxKeySentences(int $maxKeySentences): void + { + $this->maxKeySentences = $maxKeySentences; + } + + public function getPageRankPowerIteration(): int + { + return $this->pageRankPowerIteration; + } + + public function setPageRankPowerIteration(int $pageRankPowerIteration): void + { + $this->pageRankPowerIteration = $pageRankPowerIteration; + } +} diff --git a/src/Data/AlgorithmRequestInterface.php b/src/Data/AlgorithmRequestInterface.php new file mode 100644 index 0000000..a6e0aa1 --- /dev/null +++ b/src/Data/AlgorithmRequestInterface.php @@ -0,0 +1,58 @@ +id = $id; - } - - public function getId(): int - { - return $this->id; - } - - public function setValue(string $value): void - { - $this->value = $value; - } - - public function getValue(): string - { - return $this->value; - } - - public function setRank(float $rank): void - { - $this->rank = $rank; - } - - public function getRank(): float - { - return $this->rank; - } -} +id = $id; + } + + public function getId(): int + { + return $this->id; + } + + public function setValue(string $value): void + { + $this->value = $value; + } + + public function getValue(): string + { + return $this->value; + } + + public function setRank(float $rank): void + { + $this->rank = $rank; + } + + public function getRank(): float + { + return $this->rank; + } +} diff --git a/src/Data/TextRankOutput/OutputValueInterface.php b/src/Data/RankDataObjectInterface.php similarity index 74% rename from src/Data/TextRankOutput/OutputValueInterface.php rename to src/Data/RankDataObjectInterface.php index e3458d3..88f0d5d 100644 --- a/src/Data/TextRankOutput/OutputValueInterface.php +++ b/src/Data/RankDataObjectInterface.php @@ -1,20 +1,20 @@ -words = $words; + } + + public function isExist(string $word): bool + { + return array_search($word, $this->words) !== false; + } +} diff --git a/src/Data/StopWordCollectionInterface.php b/src/Data/StopWordCollectionInterface.php new file mode 100644 index 0000000..595e568 --- /dev/null +++ b/src/Data/StopWordCollectionInterface.php @@ -0,0 +1,10 @@ +tokenMap = $tokenMap; - $this->sentences = $sentences; - } - - public function getTokenMap(): TokenMapInterface - { - return $this->tokenMap; - } - - public function getSentences(): array - { - return $this->sentences; - } -} +tokenMap = $tokenMap; + $this->sentences = $sentences; + } + + public function getTokenMap(): TokenMapInterface + { + return $this->tokenMap; + } + + public function getSentences(): array + { + return $this->sentences; + } +} diff --git a/src/Data/Text/Sentence.php b/src/Data/Text/Sentence.php index 69d2385..ee947a4 100644 --- a/src/Data/Text/Sentence.php +++ b/src/Data/Text/Sentence.php @@ -1,52 +1,52 @@ -id = $id; - } - - public function getId(): int - { - return $this->id; - } - - public function setOriginalValue(string $originalValue): void - { - $this->originalValue = $originalValue; - } - - public function getOriginalValue(): string - { - return $this->originalValue; - } - - public function setVector(array $vector): void - { - $this->vector = $vector; - } - - public function getVector(): array - { - return $this->vector; - } - - public function isIndexExists(int $index): bool - { - return isset($this->vector[$index]); - } - - public function getTokenId(int $index): int - { - return $this->vector[$index]; - } -} +id = $id; + } + + public function getId(): int + { + return $this->id; + } + + public function setOriginalValue(string $originalValue): void + { + $this->originalValue = $originalValue; + } + + public function getOriginalValue(): string + { + return $this->originalValue; + } + + public function setVector(array $vector): void + { + $this->vector = $vector; + } + + public function getVector(): array + { + return $this->vector; + } + + public function isIndexExists(int $index): bool + { + return isset($this->vector[$index]); + } + + public function getTokenId(int $index): int + { + return $this->vector[$index]; + } +} diff --git a/src/Data/Text/SentenceInterface.php b/src/Data/Text/SentenceInterface.php index d8a5b17..c239ffe 100644 --- a/src/Data/Text/SentenceInterface.php +++ b/src/Data/Text/SentenceInterface.php @@ -1,30 +1,30 @@ -tokenMap = $tokenMap; - } - - public function isExists(int $tokenId): bool - { - return isset($this->tokenMap[$tokenId]); - } - - public function getToken(int $tokenId): string - { - return $this->tokenMap[$tokenId]; - } -} +tokenMap = $tokenMap; + } + + public function isExists(int $tokenId): bool + { + return isset($this->tokenMap[$tokenId]); + } + + public function getToken(int $tokenId): string + { + return $this->tokenMap[$tokenId]; + } +} diff --git a/src/Data/Text/TokenMapInterface.php b/src/Data/Text/TokenMapInterface.php index bf54db0..d966eb5 100644 --- a/src/Data/Text/TokenMapInterface.php +++ b/src/Data/Text/TokenMapInterface.php @@ -1,17 +1,17 @@ -parser = $parser; - $this->rankingAlgorithmStrategy = $rankingAlgorithmStrategy; - $this->textRankOutputBuilder = $textRankOutputBuilder; - } - - public function getKeywords( - string $rawText, - int $maxKeywords - ): TextRankOutputInterface { - - $text = $this->parser->parse($rawText); - $nodeCollection = $this->rankingAlgorithmStrategy->rank($text); - - return $this->textRankOutputBuilder->build( - $text, - $nodeCollection, - $maxKeywords - ); - } -} +parser = $generalFactory->createParser(); + $this->pageRankAlgorithm = $generalFactory->createAlgorithmStrategy(); + $this->algorithmOutputBuilder = $generalFactory->createAlgorithmBuilder(); + $this->sentenceWeighting = $generalFactory->createSentenceWeighting(); + } + + public function rank( + AlgorithmRequestInterface $algorithmRequest + ): AlgorithmOutputInterface { + + $text = $this->parser->parse( + $algorithmRequest->getRawText(), + $algorithmRequest->getStopWordCsvPath() + ); + + $nodeCollection = $this->pageRankAlgorithm->rank( + $text, + $algorithmRequest->getPageRankPowerIteration() + ); + + $sentences = $this->sentenceWeighting->weight( + $text, + $nodeCollection + ); + + return $this->algorithmOutputBuilder->build( + $text, + $nodeCollection, + $sentences, + $algorithmRequest->getMaxKeywords(), + $algorithmRequest->getMaxKeySentences() + ); + } +} diff --git a/src/Factory/GeneralFactory.php b/src/Factory/GeneralFactory.php new file mode 100644 index 0000000..5394225 --- /dev/null +++ b/src/Factory/GeneralFactory.php @@ -0,0 +1,59 @@ +getResource($path); + + while (false !== ($row = fgetcsv($resource))) { + yield array_values($row); + } + + fclose($resource); + } + + private function getResource(string $path) + { + $resource = fopen($path, 'r'); + + if (false === $resource) { + throw new IoException(sprintf('Can\'t read file [%s]', $path)); + } + + return $resource; + } +} diff --git a/src/Service/GetTopNodes.php b/src/Service/GetTopNodes.php deleted file mode 100644 index 1b201d2..0000000 --- a/src/Service/GetTopNodes.php +++ /dev/null @@ -1,30 +0,0 @@ -getNodes()); - $size = count($nodes); - - for ($i = 0; $i < $size; $i++) { - for ($j = 0; $j < $size; $j++) { - if ($nodes[$i]->getRank() > $nodes[$j]->getRank()) { - $tmp = $nodes[$i]; - $nodes[$i] = $nodes[$j]; - $nodes[$j] = $tmp; - } - } - } - - return $nodes; - } -} diff --git a/src/Service/GetTopNodesInterface.php b/src/Service/GetTopNodesInterface.php deleted file mode 100644 index 6e257bc..0000000 --- a/src/Service/GetTopNodesInterface.php +++ /dev/null @@ -1,22 +0,0 @@ -textBuilder = $textBuilder; - $this->stopWordFilter = $stopWordFilter; - } - - public function parse(string $rawText): TextInterface - { - $sentences = preg_split( - '/(\n+)|(\.\s|\?\s|\!\s)(?![^\(]*\))/', - $rawText, - -1, - PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE - ); - - $textMap = []; - - foreach ($sentences as $sentenceIndex => $sentence) { - $tokens = preg_split( - '/(?:(^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/', - $sentence, - -1, - PREG_SPLIT_NO_EMPTY - ); - - foreach ($tokens as $tokenIndex => $token) { - $token = mb_strtolower(trim($token)); - - if ($this->stopWordFilter->isStopWord($token)) { - unset($tokens[$tokenIndex]); - } else { - $tokens[$tokenIndex] = mb_strtolower(trim($token)); - } - } - - $textMap[$sentenceIndex] = $tokens; - } - - return $this->textBuilder->build( - $sentences, - $textMap - ); - } -} +textBuilder = $textBuilder; + $this->stopWordCollectionBuilder = $stopWordCollectionBuilder; + } + + public function parse(string $rawText, string $stopWordsPath): TextInterface + { + $stopWordCollection = $this + ->stopWordCollectionBuilder + ->build($stopWordsPath); + + $sentences = preg_split( + '/(\n+)|(\.\s|\?\s|\!\s)(?![^\(]*\))/', + $rawText, + -1, + PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE + ); + + $textMap = []; + + foreach ($sentences as $sentenceIndex => $sentence) { + $tokens = preg_split( + '/(?:(^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/', + $sentence, + -1, + PREG_SPLIT_NO_EMPTY + ); + + foreach ($tokens as $tokenIndex => $token) { + $token = mb_strtolower(trim($token)); + + if ($stopWordCollection->isExist($token)) { + unset($tokens[$tokenIndex]); + } else { + $tokens[$tokenIndex] = mb_strtolower(trim($token)); + } + } + + $textMap[$sentenceIndex] = $tokens; + } + + return $this->textBuilder->build( + $sentences, + $textMap + ); + } +} diff --git a/src/Service/ParserInterface.php b/src/Service/ParserInterface.php new file mode 100644 index 0000000..9361546 --- /dev/null +++ b/src/Service/ParserInterface.php @@ -0,0 +1,12 @@ +getId()] = $keyword->getRank(); - } - - $sentenceOutputList = []; - - foreach ($text->getSentences() as $sentence) { - $vector = $sentence->getVector(); - $score = .0; - - foreach ($vector as $tokenId) { - if (isset($keywordRankMap[$tokenId])) { - $score += $keywordRankMap[$tokenId]; - } - } - - $score = $score / count($vector); - $sentenceOutput = new OutputValue(); - $sentenceOutput->setId($sentence->getId()); - $sentenceOutput->setRank($score); - $sentenceOutput->setValue($sentence->getOriginalValue()); - - $sentenceOutputList[] = $sentenceOutput; - } - - return $sentenceOutputList; - } -} +createRankMap($nodeCollection); + $sentenceOutputList = []; + + foreach ($text->getSentences() as $sentence) { + $vector = $sentence->getVector(); + $weight = .0; + + foreach ($vector as $tokenId) { + $weight += $rankMap[$tokenId]; + } + + $weight = $weight / max(1, count($vector)); + + $sentenceOutputList[] = $this + ->createSentence( + $sentence->getId(), + $weight, + $sentence->getOriginalValue() + ); + } + + return $sentenceOutputList; + } + + private function createSentence( + int $id, + float $rank, + string $originalValue + ): RankDataObjectInterface { + + $sentence = new RankDataObject(); + $sentence->setId($id); + $sentence->setRank($rank); + $sentence->setValue($originalValue); + + return $sentence; + } + + private function createRankMap( + NodeCollectionInterface $nodeCollection + ): array { + $rankMap = []; + + foreach ($nodeCollection->getNodes() as $node) { + $rankMap[$node->getId()] = $node->getRank(); + } + + return $rankMap; + } +} diff --git a/src/Service/SentenceWeightingInterface.php b/src/Service/SentenceWeightingInterface.php index 7bd29c5..c41a669 100644 --- a/src/Service/SentenceWeightingInterface.php +++ b/src/Service/SentenceWeightingInterface.php @@ -1,19 +1,23 @@ -getRank() > $rankList[$j]->getRank()) { + $tmp = $rankList[$i]; + $rankList[$i] = $rankList[$j]; + $rankList[$j] = $tmp; + } + } + } + + return $rankList; + } +} diff --git a/src/Service/SortRankDataListInterface.php b/src/Service/SortRankDataListInterface.php new file mode 100644 index 0000000..929c056 --- /dev/null +++ b/src/Service/SortRankDataListInterface.php @@ -0,0 +1,18 @@ +stopWordCollection = $stopWordCollection; - } - - public function isStopWord(string $word): bool - { - return array_search($word, $this->stopWordCollection->words) !== false; - } -} diff --git a/src/Strategy/PageRankStrategy.php b/src/Strategy/PageRankStrategy.php index 20bab66..89ba665 100644 --- a/src/Strategy/PageRankStrategy.php +++ b/src/Strategy/PageRankStrategy.php @@ -1,81 +1,82 @@ -pageRankDataSourceBuilder = $pageRankDataSourceBuilder; - } - - public function rank(TextInterface $text): NodeCollectionInterface - { - $dataSource = $this->pageRankDataSourceBuilder->build($text); - $strategy = $this->createPageRankStrategy($dataSource); - $ranking = $this->createRanking($strategy); - $pageRankAlgorithm = $this->createPageRankAlgorithm($ranking, $strategy); - $maxIteration = 1000; - - return $pageRankAlgorithm->run($maxIteration); - } - - private function createPageRankAlgorithm( - RankingInterface $ranking, - NodeDataSourceStrategyInterface $strategy - ): PageRankAlgorithmInterface { - - $normalizer = new Normalizer(); - - return new PageRankAlgorithm( - $ranking, - $strategy, - $normalizer - ); - } - - private function createPageRankStrategy( - array $dataSource - ): NodeDataSourceStrategyInterface { - - $nodeBuilder = new NodeBuilder(); - $nodeCollectionBuilder = new NodeCollectionBuilder(); - - return new MemorySourceStrategy( - $nodeBuilder, - $nodeCollectionBuilder, - $dataSource - ); - } - - private function createRanking( - NodeDataSourceStrategyInterface $strategy - ): RankingInterface { - - $rankComparator = new RankComparator(); - - return new Ranking( - $rankComparator, - $strategy - ); - } -} +pageRankDataSourceBuilder = $pageRankDataSourceBuilder; + } + + public function rank( + TextInterface $text, + int $iteration + ): NodeCollectionInterface { + $dataSource = $this->pageRankDataSourceBuilder->build($text); + $strategy = $this->createPageRankStrategy($dataSource); + $ranking = $this->createRanking($strategy); + $pageRankAlgorithm = $this->createPageRankAlgorithm($ranking, $strategy); + + return $pageRankAlgorithm->run($iteration); + } + + private function createPageRankAlgorithm( + RankingInterface $ranking, + NodeDataSourceStrategyInterface $strategy + ): PageRankAlgorithmInterface { + + $normalizer = new Normalizer(); + + return new PageRankAlgorithm( + $ranking, + $strategy, + $normalizer + ); + } + + private function createPageRankStrategy( + array $dataSource + ): NodeDataSourceStrategyInterface { + + $nodeBuilder = new NodeBuilder(); + $nodeCollectionBuilder = new NodeCollectionBuilder(); + + return new MemorySourceStrategy( + $nodeBuilder, + $nodeCollectionBuilder, + $dataSource + ); + } + + private function createRanking( + NodeDataSourceStrategyInterface $strategy + ): RankingInterface { + + $rankComparator = new RankComparator(); + + return new Ranking( + $rankComparator, + $strategy + ); + } +} diff --git a/src/Strategy/RankingAlgorithmStrategyInterface.php b/src/Strategy/RankingAlgorithmStrategyInterface.php index 3387864..e771d72 100644 --- a/src/Strategy/RankingAlgorithmStrategyInterface.php +++ b/src/Strategy/RankingAlgorithmStrategyInterface.php @@ -1,13 +1,16 @@ - - * $stopWords = new English(); - * - * $textRank = new TextRankFacade(); - * $textRank->setStopWords($stopWords); - * - * $sentences = $textRank->summarizeTextFreely( - * $rawText, - * 5, - * 2, - * Summarize::GET_ALL_IMPORTANT - * ); - * - * - * @package PhpScience\TextRank + * @deprecated Use PhpScience\TextRank\Facade\TextRank instead. */ class TextRankFacade { /** - * Stop Words - * - * Stop Words to ignore because of dummy words. These words will not be Key - * Words. A, like, no yes, one, two, I, you for example. - * - * @see \PhpScience\TextRank\Tool\StopWords\English - * * @var StopWordsAbstract */ - protected $stopWords; + protected StopWordsAbstract $stopWords; /** - * Set Stop Words. - * - * @param StopWordsAbstract $stopWords Stop Words to ignore because of - * dummy words. + * @param StopWordsAbstract $stopWords */ public function setStopWords(StopWordsAbstract $stopWords) { From e58ff6b0aea54510c502a08e3f1afea215ee9ad8 Mon Sep 17 00:00:00 2001 From: David Belicza <87.bdavid@gmail.com> Date: Sat, 26 Sep 2020 19:45:01 +0200 Subject: [PATCH 5/5] Old code has been removed, Sorting algorithm has been changed, Workflow refactored, Smaller bugfixes --- src/Builder/AlgorithmOutputBuilder.php | 6 +- src/Data/AlgorithmRequest.php | 21 +- src/Data/AlgorithmRequestInterface.php | 10 + src/Facade/TextRank.php | 3 +- src/Service/Parser.php | 21 +- src/Service/ParserInterface.php | 6 +- src/Service/SortRankDataList.php | 29 +- src/TextRankFacade.php | 235 -------- src/Tool/Graph.php | 83 --- src/Tool/Parser.php | 213 ------- src/Tool/Score.php | 176 ------ src/Tool/StopWords/English.php | 334 ---------- src/Tool/StopWords/French.php | 706 ---------------------- src/Tool/StopWords/German.php | 616 ------------------- src/Tool/StopWords/Italian.php | 676 --------------------- src/Tool/StopWords/Norwegian.php | 238 -------- src/Tool/StopWords/Russian.php | 575 ------------------ src/Tool/StopWords/Spanish.php | 738 ----------------------- src/Tool/StopWords/StopWordsAbstract.php | 27 - src/Tool/Summarize.php | 224 ------- src/Tool/Text.php | 99 --- tests/functional/TextRankFacadeTest.php | 151 ----- 22 files changed, 74 insertions(+), 5113 deletions(-) delete mode 100644 src/TextRankFacade.php delete mode 100644 src/Tool/Graph.php delete mode 100644 src/Tool/Parser.php delete mode 100644 src/Tool/Score.php delete mode 100644 src/Tool/StopWords/English.php delete mode 100644 src/Tool/StopWords/French.php delete mode 100644 src/Tool/StopWords/German.php delete mode 100644 src/Tool/StopWords/Italian.php delete mode 100644 src/Tool/StopWords/Norwegian.php delete mode 100644 src/Tool/StopWords/Russian.php delete mode 100644 src/Tool/StopWords/Spanish.php delete mode 100644 src/Tool/StopWords/StopWordsAbstract.php delete mode 100644 src/Tool/Summarize.php delete mode 100644 src/Tool/Text.php delete mode 100644 tests/functional/TextRankFacadeTest.php diff --git a/src/Builder/AlgorithmOutputBuilder.php b/src/Builder/AlgorithmOutputBuilder.php index ecf11dd..a4f2b25 100644 --- a/src/Builder/AlgorithmOutputBuilder.php +++ b/src/Builder/AlgorithmOutputBuilder.php @@ -29,7 +29,11 @@ public function build( int $maxSentences ): AlgorithmOutputInterface { $words = $this->createWordList($text, $nodeCollection, $maxKeywords); - $sentences = $this->sortRankDataList->sort($sentences); + $sentences = array_slice( + $this->sortRankDataList->sort($sentences), + 0, + $maxSentences + ); $textRankOutput = new AlgorithmOutput(); $textRankOutput->setKeyWords($words); diff --git a/src/Data/AlgorithmRequest.php b/src/Data/AlgorithmRequest.php index 74e62dc..dc525ee 100644 --- a/src/Data/AlgorithmRequest.php +++ b/src/Data/AlgorithmRequest.php @@ -8,17 +8,20 @@ class AlgorithmRequest implements AlgorithmRequestInterface { private string $stopWordCsvPath; private string $rawText; + private int $minKeywordLength; private int $maxKeywords; private int $maxKeySentences; private int $pageRankPowerIteration; public function __construct( string $stopWordCsvPath = __DIR__ . '/../resource/stop-word/english.csv', + int $minKeywordLength = 3, int $maxKeywords = 10, int $maxKeySentences = 5, int $pageRankPowerIteration = 10 ) { $this->stopWordCsvPath = $stopWordCsvPath; + $this->minKeywordLength = $minKeywordLength; $this->maxKeywords = $maxKeywords; $this->maxKeySentences = $maxKeySentences; $this->pageRankPowerIteration = $pageRankPowerIteration; @@ -34,6 +37,16 @@ public function setStopWordCsvPath(string $stopWordCsvPath): void $this->stopWordCsvPath = $stopWordCsvPath; } + public function getMinKeywordLength(): int + { + return $this->minKeywordLength; + } + + public function setMinKeywordLength(int $minKeywordLength): void + { + $this->minKeywordLength = $minKeywordLength; + } + public function getRawText(): string { return $this->rawText; @@ -64,13 +77,13 @@ public function setMaxKeySentences(int $maxKeySentences): void $this->maxKeySentences = $maxKeySentences; } - public function getPageRankPowerIteration(): int + public function setPageRankPowerIteration(int $pageRankPowerIteration): void { - return $this->pageRankPowerIteration; + $this->pageRankPowerIteration = $pageRankPowerIteration; } - public function setPageRankPowerIteration(int $pageRankPowerIteration): void + public function getPageRankPowerIteration(): int { - $this->pageRankPowerIteration = $pageRankPowerIteration; + return $this->pageRankPowerIteration; } } diff --git a/src/Data/AlgorithmRequestInterface.php b/src/Data/AlgorithmRequestInterface.php index a6e0aa1..3da182a 100644 --- a/src/Data/AlgorithmRequestInterface.php +++ b/src/Data/AlgorithmRequestInterface.php @@ -16,6 +16,16 @@ public function getStopWordCsvPath(): string; */ public function setStopWordCsvPath(string $stopWordCsvPath): void; + /** + * @return int + */ + public function getMinKeywordLength(): int; + + /** + * @param int $minKeywordLength + */ + public function setMinKeywordLength(int $minKeywordLength): void; + /** * @return string */ diff --git a/src/Facade/TextRank.php b/src/Facade/TextRank.php index 14f536f..f81a511 100644 --- a/src/Facade/TextRank.php +++ b/src/Facade/TextRank.php @@ -34,7 +34,8 @@ public function rank( $text = $this->parser->parse( $algorithmRequest->getRawText(), - $algorithmRequest->getStopWordCsvPath() + $algorithmRequest->getStopWordCsvPath(), + $algorithmRequest->getMinKeywordLength() ); $nodeCollection = $this->pageRankAlgorithm->rank( diff --git a/src/Service/Parser.php b/src/Service/Parser.php index 36c0321..59d51b5 100644 --- a/src/Service/Parser.php +++ b/src/Service/Parser.php @@ -21,8 +21,11 @@ public function __construct( $this->stopWordCollectionBuilder = $stopWordCollectionBuilder; } - public function parse(string $rawText, string $stopWordsPath): TextInterface - { + public function parse( + string $rawText, + string $stopWordsPath, + int $minimumTokenLength + ): TextInterface { $stopWordCollection = $this ->stopWordCollectionBuilder ->build($stopWordsPath); @@ -34,6 +37,12 @@ public function parse(string $rawText, string $stopWordsPath): TextInterface PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE ); + foreach ($sentences as $sentenceIndex => $sentence) { + if (1 === strlen(trim($sentence))) { + unset($sentences[$sentenceIndex]); + } + } + $textMap = []; foreach ($sentences as $sentenceIndex => $sentence) { @@ -47,10 +56,14 @@ public function parse(string $rawText, string $stopWordsPath): TextInterface foreach ($tokens as $tokenIndex => $token) { $token = mb_strtolower(trim($token)); - if ($stopWordCollection->isExist($token)) { + if ( + ctype_punct($token) + || mb_strlen($token) < $minimumTokenLength + || $stopWordCollection->isExist($token) + ) { unset($tokens[$tokenIndex]); } else { - $tokens[$tokenIndex] = mb_strtolower(trim($token)); + $tokens[$tokenIndex] = $token; } } diff --git a/src/Service/ParserInterface.php b/src/Service/ParserInterface.php index 9361546..b432975 100644 --- a/src/Service/ParserInterface.php +++ b/src/Service/ParserInterface.php @@ -8,5 +8,9 @@ interface ParserInterface { - public function parse(string $rawText, string $stopWordsPath): TextInterface; + public function parse( + string $rawText, + string $stopWordsPath, + int $minimumTokenLength + ): TextInterface; } diff --git a/src/Service/SortRankDataList.php b/src/Service/SortRankDataList.php index db557cc..2273597 100644 --- a/src/Service/SortRankDataList.php +++ b/src/Service/SortRankDataList.php @@ -11,18 +11,25 @@ class SortRankDataList implements SortRankDataListInterface */ public function sort(array $rankList): array { - $size = count($rankList); - - for ($i = 0; $i < $size; $i++) { - for ($j = 0; $j < $size; $j++) { - if ($rankList[$i]->getRank() > $rankList[$j]->getRank()) { - $tmp = $rankList[$i]; - $rankList[$i] = $rankList[$j]; - $rankList[$j] = $tmp; - } - } + $rankIndex = $this->getIndexedRank($rankList); + arsort($rankIndex); + $rankCollection = []; + + foreach ($rankIndex as $index => $rank) { + $rankCollection[] = $rankList[$index]; + } + + return $rankCollection; + } + + private function getIndexedRank(array $rankList): array + { + $rankIndex = []; + + foreach ($rankList as $index => $rankObject) { + $rankIndex[$index] = $rankObject->getRank(); } - return $rankList; + return $rankIndex; } } diff --git a/src/TextRankFacade.php b/src/TextRankFacade.php deleted file mode 100644 index 20ddd34..0000000 --- a/src/TextRankFacade.php +++ /dev/null @@ -1,235 +0,0 @@ -stopWords = $stopWords; - } - - /** - * Only Keywords - * - * It retrieves the possible keywords with their scores from a text. - * - * @param string $rawText A single raw text. - * - * @return array Array from Keywords. Key is the parsed word, value is the - * word score. - */ - public function getOnlyKeyWords(string $rawText): array - { - $parser = new Parser(); - $parser->setMinimumWordLength(3); - $parser->setRawText($rawText); - - if ($this->stopWords) { - $parser->setStopWords($this->stopWords); - } - - $text = $parser->parse(); - - $graph = new Graph(); - $graph->createGraph($text); - - $score = new Score(); - - return $score->calculate( - $graph, $text - ); - } - - /** - * Highlighted Texts - * - * It finds the most important sentences from a text by the most important - * keywords and these keywords also found by automatically. It retrieves - * the most important sentences what are 20 percent of the full text. - * - * @param string $rawText A single raw text. - * - * @return array An array from sentences. - */ - public function getHighlights(string $rawText): array - { - $parser = new Parser(); - $parser->setMinimumWordLength(3); - $parser->setRawText($rawText); - - if ($this->stopWords) { - $parser->setStopWords($this->stopWords); - } - - $text = $parser->parse(); - $maximumSentences = (int) (count($text->getSentences()) * 0.2); - - $graph = new Graph(); - $graph->createGraph($text); - - $score = new Score(); - $scores = $score->calculate($graph, $text); - - $summarize = new Summarize(); - - return $summarize->getSummarize( - $scores, - $graph, - $text, - 12, - $maximumSentences, - Summarize::GET_ALL_IMPORTANT - ); - } - - /** - * Compounds a Summarized Text - * - * It finds the three most important sentences from a text by the most - * important keywords and these keywords also found by automatically. It - * retrieves these important sentences. - * - * @param string $rawText A single raw text. - * - * @return array An array from sentences. - */ - public function summarizeTextCompound(string $rawText): array - { - $parser = new Parser(); - $parser->setMinimumWordLength(3); - $parser->setRawText($rawText); - - if ($this->stopWords) { - $parser->setStopWords($this->stopWords); - } - - $text = $parser->parse(); - - $graph = new Graph(); - $graph->createGraph($text); - - $score = new Score(); - $scores = $score->calculate($graph, $text); - - $summarize = new Summarize(); - - return $summarize->getSummarize( - $scores, - $graph, - $text, - 10, - 3, - Summarize::GET_ALL_IMPORTANT - ); - } - - /** - * Summarized Text - * - * It finds the most important sentence from a text by the most important - * keywords and these keywords also found by automatically. It retrieves - * the most important sentence and its following sentences. - * - * @param string $rawText A single raw text. - * - * @return array An array from sentences. - */ - public function summarizeTextBasic(string $rawText): array - { - $parser = new Parser(); - $parser->setMinimumWordLength(3); - $parser->setRawText($rawText); - - if ($this->stopWords) { - $parser->setStopWords($this->stopWords); - } - - $text = $parser->parse(); - - $graph = new Graph(); - $graph->createGraph($text); - - $score = new Score(); - $scores = $score->calculate($graph, $text); - - $summarize = new Summarize(); - - return $summarize->getSummarize( - $scores, - $graph, - $text, - 10, - 3, - Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS - ); - } - - /** - * Freely Summarized Text. - * - * It retrieves the most important sentences from a text by the most important - * keywords and these keywords also found by automatically. - * - * @param string $rawText A single raw text. - * @param int $analyzedKeyWords Maximum number of the most important - * Key Words to analyze the text. - * @param int $expectedSentences How many sentence should be retrieved. - * @param int $summarizeType Highlights from the text or a part of - * the text. - * - * @return array An array from sentences. - */ - public function summarizeTextFreely( - string $rawText, - int $analyzedKeyWords, - int $expectedSentences, - int $summarizeType - ): array { - $parser = new Parser(); - $parser->setMinimumWordLength(3); - $parser->setRawText($rawText); - - if ($this->stopWords) { - $parser->setStopWords($this->stopWords); - } - - $text = $parser->parse(); - - $graph = new Graph(); - $graph->createGraph($text); - - $score = new Score(); - $scores = $score->calculate($graph, $text); - - $summarize = new Summarize(); - - return $summarize->getSummarize( - $scores, - $graph, - $text, - $analyzedKeyWords, - $expectedSentences, - $summarizeType - ); - } -} diff --git a/src/Tool/Graph.php b/src/Tool/Graph.php deleted file mode 100644 index 06cdbe9..0000000 --- a/src/Tool/Graph.php +++ /dev/null @@ -1,83 +0,0 @@ -getWordMatrix(); - - foreach ($wordMatrix as $sentenceIdx => $words) { - $idxArray = array_keys($words); - - foreach ($idxArray as $idxKey => $idxValue) { - $connections = []; - - if (isset($idxArray[$idxKey - 1])) { - $connections[] = $idxArray[$idxKey - 1]; - } - - if (isset($idxArray[$idxKey + 1])) { - $connections[] = $idxArray[$idxKey + 1]; - } - - $this->graph[$words[$idxValue]][$sentenceIdx][$idxValue] = $connections; - } - } - } - - /** - * Graph. - * - * It retrieves the graph. Key is the word, value is an array with the - * sentence IDs. - * - * - * array( - * 'apple' => array( // word - * 2 => array( // ID of the sentence - * 52 => array( // ID of the word in the sentence - * 51, 53 // IDs of the closest words to the apple word - * ), - * 10 => array( // IDs of the closest words to the apple word - * 9, 11 // IDs of the closest words to the apple word - * ), - * 5 => array(6) - * ), - * 6 => array( - * 9 => array(8, 10) - * ), - * ), - * 'orange' => array( - * 1 => array( - * 30 => array(29, 31) - * ) - * ) - * ); - * - * - * @return array - */ - public function getGraph(): array - { - return $this->graph; - } -} diff --git a/src/Tool/Parser.php b/src/Tool/Parser.php deleted file mode 100644 index 76829be..0000000 --- a/src/Tool/Parser.php +++ /dev/null @@ -1,213 +0,0 @@ -minimumWordLength = $wordLength; - } - - /** - * It sets the raw text. - * - * @param string $rawText - */ - public function setRawText(string $rawText) - { - $this->rawText = $rawText; - } - - /** - * Set Stop Words. - * - * It sets the stop words to remove them from the found keywords. - * - * @param StopWordsAbstract $words Stop Words to ignore. These words will - * not be keywords. - */ - public function setStopWords(StopWordsAbstract $words) - { - $this->stopWords = $words; - } - - /** - * It retrieves the punctuations. - * - * @return array Array from punctuations where key is the index to link to - * the sentence and value is the punctuation. - */ - public function getMarks(): array - { - return $this->marks; - } - - /** - * Parse. - * - * It parses the text from the property and retrieves in Text object - * prepared to scoring and to searching. - * - * @return Text Parsed text prepared to scoring. - */ - public function parse(): Text - { - $matrix = []; - $sentences = $this->getSentences(); - - foreach ($sentences as $sentenceIdx => $sentence) { - $matrix[$sentenceIdx] = $this->getWords($sentence); - } - - $text = new Text(); - $text->setSentences($sentences); - $text->setWordMatrix($matrix); - $text->setMarks($this->marks); - - return $text; - } - - /** - * Sentences. - * - * It retrieves the sentences in array without junk data. - * - * @return array Array from sentences. - */ - protected function getSentences(): array - { - $sentences = $sentences = preg_split( - '/(\n+)|(\.\s|\?\s|\!\s)(?![^\(]*\))/', - $this->rawText, - -1, - PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE - ); - - return array_values( - array_filter( - array_map( - [$this, 'cleanSentence'], - $sentences - ) - ) - ); - } - - /** - * Possible Keywords. - * - * It retrieves an array of possible keywords without junk characters, - * spaces and stop words. - * - * @param string $subText It should be a sentence. - * - * @return array The array of the possible keywords. - */ - protected function getWords(string $subText): array - { - $words = preg_split( - '/(?:(^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/', - $subText, - -1, - PREG_SPLIT_NO_EMPTY - ); - - $words = array_values( - array_filter( - array_map( - [$this, 'cleanWord'], - $words - ) - ) - ); - - if ($this->stopWords) { - return array_filter($words, function($word) { - return !ctype_punct($word) - && strlen($word) > $this->minimumWordLength - && !$this->stopWords->exist($word); - }); - } else { - return array_filter($words, function($word) { - return !ctype_punct($word) - && strlen($word) > $this->minimumWordLength; - }); - } - } - - /** - * Clean Sentence. - * - * It clean the sentence. If it is a punctuation it will be stored in the - * property $marks. - * - * @param string $sentence A sentence as a string. - * - * @return string It is empty string when it's punctuation. Otherwise it's - * the trimmed sentence itself. - */ - protected function cleanSentence(string $sentence): string - { - if (strlen(trim($sentence)) == 1) { - $this->marks[] = trim($sentence); - return ''; - - } else { - return trim($sentence); - } - } - - /** - * Clean Word. - * - * It removes the junk spaces from the word and retrieves it. - * - * @param string $word - * - * @return string Cleaned word. - */ - protected function cleanWord(string $word): string - { - return mb_strtolower(trim($word)); - } -} diff --git a/src/Tool/Score.php b/src/Tool/Score.php deleted file mode 100644 index f28b2c3..0000000 --- a/src/Tool/Score.php +++ /dev/null @@ -1,176 +0,0 @@ -getGraph(); - $wordMatrix = $text->getWordMatrix(); - $wordConnections = $this->calculateConnectionNumbers($graphData); - $scores = $this->calculateScores( - $graphData, - $wordMatrix, - $wordConnections - ); - - return $this->normalizeAndSortScores($scores); - } - - /** - * Connection Numbers. - * - * It calculates the number of connections for each word and retrieves it - * in array where key is the word and value is the number of connections. - * - * @param array $graphData Graph data from a Graph type object. - * - * @return array Key is the word and value is the number of the connected - * words. - */ - protected function calculateConnectionNumbers(array &$graphData): array - { - $wordConnections = []; - - foreach ($graphData as $wordKey => $sentences) { - $connectionCount = 0; - - foreach ($sentences as $sentenceIdx => $wordInstances) { - foreach ($wordInstances as $connections) { - $connectionCount += count($connections); - } - } - - $wordConnections[$wordKey] = $connectionCount; - } - - return $wordConnections; - } - - /** - * Calculate Scores. - * - * It calculates the score of the words and retrieves it in array where key - * is the word and value is the score. The score depends on the number of - * the connections and the closest word's connection numbers. - * - * @param array $graphData Graph data from a Graph type object. - * @param array $wordMatrix Multidimensional array from integer keys - * and string values. - * @param array $wordConnections Key is the word and value is the number of - * the connected words. - * - * @return array Scores where key is the word and value is the score. - */ - protected function calculateScores( - array &$graphData, - array &$wordMatrix, - array &$wordConnections - ): array { - $scores = []; - - foreach ($graphData as $wordKey => $sentences) { - $value = 0; - - foreach ($sentences as $sentenceIdx => $wordInstances) { - foreach ($wordInstances as $connections) { - foreach ($connections as $wordIdx) { - $word = $wordMatrix[$sentenceIdx][$wordIdx]; - $value += $wordConnections[$word]; - } - } - } - - $scores[$wordKey] = $value; - - if ($value > $this->maximumValue) { - $this->maximumValue = $value; - } - - if ($value < $this->minimumValue || $this->minimumValue == 0) { - $this->minimumValue = $value; - } - } - - return $scores; - } - - /** - * Normalize and Sort Scores. - * - * It recalculates the scores by normalize the score numbers to between 0 - * and 1. - * - * @param array $scores Keywords with scores. Score is the key. - * - * @return array Keywords with normalized and ordered scores. - */ - protected function normalizeAndSortScores(array &$scores): array - { - foreach ($scores as $key => $value) { - $v = $this->normalize( - $value, - $this->minimumValue, - $this->maximumValue - ); - - $scores[$key] = $v; - } - - arsort($scores); - - return $scores; - } - - /** - * It normalizes a number. - * - * @param int $value Current weight. - * @param int $min Minimum weight. - * @param int $max Maximum weight. - * - * @return float|int Normalized weight aka score. - */ - protected function normalize(int $value, int $min, int $max): float - { - $divisor = $max - $min; - - if ($divisor == 0) { - return 0.0; - } - - $normalized = ($value - $min) / $divisor; - - return $normalized; - } -} diff --git a/src/Tool/StopWords/English.php b/src/Tool/StopWords/English.php deleted file mode 100644 index 4b08b8b..0000000 --- a/src/Tool/StopWords/English.php +++ /dev/null @@ -1,334 +0,0 @@ -words) !== false; - } -} diff --git a/src/Tool/Summarize.php b/src/Tool/Summarize.php deleted file mode 100644 index fd4fed4..0000000 --- a/src/Tool/Summarize.php +++ /dev/null @@ -1,224 +0,0 @@ -getGraph(); - $sentences = $text->getSentences(); - $marks = $text->getMarks(); - $this->findAndWeightSentences($scores, $graphData, $keyWordLimit); - - if ($type == Summarize::GET_ALL_IMPORTANT) { - return $this->getAllImportant($sentences, $marks, $sentenceLimit); - - } else if ($type == Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS) { - return $this->getFirstImportantAndFollowings( - $sentences, - $marks, - $sentenceLimit - ); - } - - return []; - } - - /** - * Find and Weight Sentences. - * - * It finds the most important sentences and stores them into the property. - * - * @param array $scores Keywords with scores. Score is the key. - * @param array $graphData Graph data from a Graph type object. - * @param int $keyWordLimit How many keyword should be used to find the - * important sentences. - */ - protected function findAndWeightSentences( - array &$scores, - array &$graphData, - int $keyWordLimit - ) { - $i = 0; - - foreach ($scores as $word => $score) { - if ($i >= $keyWordLimit) { - break; - } - - $i++; - $wordMap = $graphData[$word]; - - foreach ($wordMap as $key => $value) { - $this->updateSentenceWeight($key); - } - } - - arsort($this->sentenceWeight); - } - - /** - * Important Sentences. - * - * It retrieves the important sentences. - * - * @param array $sentences Sentences, ordered by weights. - * @param array $marks Array of punctuations. Key is the reference - * to the sentence, value is the punctuation. - * @param int $sentenceLimit How many sentence should be retrieved. - * - * @return array An array from sentences what are the most important - * sentences. - */ - protected function getAllImportant( - array &$sentences, - array &$marks, - int $sentenceLimit - ): array { - - $summary = []; - $i = 0; - - foreach ($this->sentenceWeight as $sentenceIdx => $weight) { - if ($i >= $sentenceLimit) { - break; - } - - $i++; - $summary[$sentenceIdx] = $sentences[$sentenceIdx] - . $this->getMark($marks, $sentenceIdx); - } - - ksort($summary); - - return $summary; - } - - /** - * Most Important Sentence and Next. - * - * It retrieves the first most important sentence and its following - * sentences. - * - * @param array $sentences Sentences, ordered by weights. - * @param array $marks Array of punctuations. Key is the reference - * to the sentence, value is the punctuation. - * @param int $sentenceLimit How many sentence should be retrieved. - * - * @return array An array from sentences what contains the most important - * sentence and its following sentences. - */ - protected function getFirstImportantAndFollowings( - array &$sentences, - array &$marks, - int $sentenceLimit - ): array { - - $summary = []; - $startIdx = 0; - - foreach ($this->sentenceWeight as $sentenceIdx => $weight) { - $summary[$sentenceIdx] = $sentences[$sentenceIdx] . - $this->getMark($marks, $sentenceIdx); - - $startIdx = $sentenceIdx; - break; - } - - $i = 0; - - foreach ($sentences as $sentenceIdx => $sentence) { - if ($sentenceIdx <= $startIdx) { - continue; - } else if ($i >= $sentenceLimit - 1) { - break; - } - - $i++; - $summary[$sentenceIdx] = $sentences[$sentenceIdx] . - $this->getMark($marks, $sentenceIdx); - } - - return $summary; - } - - /** - * Update Sentence Weight. - * - * It updates the sentence weight what is stored in the property. - * - * @param int $sentenceIdx Index of the sentence. - */ - protected function updateSentenceWeight(int $sentenceIdx) - { - if (isset($this->sentenceWeight[$sentenceIdx])) { - $this->sentenceWeight[$sentenceIdx] = $this->sentenceWeight[$sentenceIdx] + 1; - } else { - $this->sentenceWeight[$sentenceIdx] = 1; - } - } - - /** - * Punctuations. - * - * It retrieves the punctuation of the sentence. - * - * @param array $marks The punctuation. Key is the reference to the - * sentence, value is the punctuation. - * @param int $idx Key of the punctuation. - * - * @return string The punctuation of the sentence. - */ - protected function getMark(array &$marks, int $idx) - { - return isset($marks[$idx]) ? $marks[$idx] : ''; - } -} diff --git a/src/Tool/Text.php b/src/Tool/Text.php deleted file mode 100644 index 458fc04..0000000 --- a/src/Tool/Text.php +++ /dev/null @@ -1,99 +0,0 @@ -wordMatrix = $wordMatrix; - } - - /** - * It sets the sentences. - * - * @param array $sentences Array's key should be an int and value should be - * string. - */ - public function setSentences(array $sentences) - { - $this->sentences = $sentences; - } - - /** - * It set the punctuations to the property. - * - * @param array $marks Array's key should be an int and value should be - * string. - */ - public function setMarks(array $marks) - { - $this->marks = $marks; - } - - /** - * It retrieves the words in sentence groups. - * - * @return array Multidimensional array from words of the text. Key is - * index of the sentence, value is an array from words - * where key is the index of the word and value is the word. - */ - public function getWordMatrix(): array - { - return $this->wordMatrix; - } - - /** - * It retrieves the sentences. - * - * @return array Array from sentences where key is the index and value is - * the sentence. - */ - public function getSentences(): array - { - return $this->sentences; - } - - /** - * It retrieves the punctuations. - * - * @return array Array from punctuations where key is the index to link to - * the sentence and value is the punctuation. - */ - public function getMarks(): array - { - return $this->marks; - } -} diff --git a/tests/functional/TextRankFacadeTest.php b/tests/functional/TextRankFacadeTest.php deleted file mode 100644 index 41151ac..0000000 --- a/tests/functional/TextRankFacadeTest.php +++ /dev/null @@ -1,151 +0,0 @@ -sampleText1 = fread($file, filesize($path)); - - fclose($file); - } - - public function testGetOnlyKeyWords() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->getOnlyKeyWords($this->sampleText1); - - $this->assertTrue(count($result) > 0); - $this->assertTrue(array_values($result)[0] == 1); - } - - public function testGetHighlights() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->getHighlights($this->sampleText1); - - $this->assertTrue(count($result) > 0); - } - - public function testSummarizeTextCompound() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->summarizeTextCompound($this->sampleText1); - - $this->assertTrue(count($result) > 0); - } - - public function testSummarizeTextBasic() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->summarizeTextBasic($this->sampleText1); - - $this->assertTrue(count($result) > 0); - } - - public function testSummarizeTextFreely() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->summarizeTextFreely( - $this->sampleText1, - 5, - 2, - Summarize::GET_ALL_IMPORTANT - ); - - $this->assertTrue(count($result) == 2); - - $result = $api->summarizeTextFreely( - $this->sampleText1, - 10, - 1, - Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS - ); - - $this->assertTrue(count($result) == 1); - - // Stop words. - $result = $api->summarizeTextFreely( - 'one two. one two. three four.', - 2, - 10, - Summarize::GET_ALL_IMPORTANT - ); - - $this->assertTrue(count($result) == 0); - - // Less sentences then expected. - $result = $api->summarizeTextFreely( - 'lorem ipsum. lorem holy ipsum. sit dolor amet.', - 2, - 10, - Summarize::GET_ALL_IMPORTANT - ); - - $this->assertTrue(count($result) == 2); - } - - public function testSmallText() - { - $api = new TextRankFacade(); - $stopWords = new English(); - $api->setStopWords($stopWords); - - $result = $api->getOnlyKeyWords('lorem ipsum sit'); - - $this->assertEquals(2, count($result)); - - $result = $api->getOnlyKeyWords('sit'); - - $this->assertEquals(0, count($result)); - - $result = $api->getOnlyKeyWords(''); - - $this->assertEquals(0, count($result)); - } - - public function testSmallTextRu() - { - $api = new TextRankFacade(); - $stopWords = new Russian(); - $api->setStopWords($stopWords); - $result = $api->getOnlyKeyWords('между холодными ладонями'); - $this->assertCount(2, $result); - - $result = $api->getOnlyKeyWords('конец'); - $this->assertCount(0, $result); - - $result = $api->getOnlyKeyWords(''); - $this->assertCount(0, $result); - } -}