Skip to content

Commit

Permalink
Extract Top K elements from data
Browse files Browse the repository at this point in the history
  • Loading branch information
robosherpa committed Apr 12, 2019
1 parent 0e8ce4d commit daf5145
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 37 deletions.
18 changes: 15 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 2.6)
project(cryptopangrams)
project(sample-code)

# Locate GTest
find_package(GTest REQUIRED)
Expand All @@ -11,12 +11,24 @@ include_directories(

# Link executables
add_executable(
cryptopangramsTests
sampleCryptoPangramsTest
tests/cryptopangrams_test.cpp
)

target_link_libraries(
cryptopangramsTests
sampleCryptoPangramsTest
${GTEST_LIBRARIES}
pthread
)

# Link executables
add_executable(
sampleTopKTest
tests/topK_test.cpp
)

target_link_libraries(
sampleTopKTest
${GTEST_LIBRARIES}
pthread
)
48 changes: 28 additions & 20 deletions README.MD
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
# Problem
# Problem #1

Given a vector of input data, return the top K most frequently occurring values in the input. Make appropriate assumptions.

## Input
The input can be a vector of integers, floating-point numbers, characters, or strings.

## Output
A vector of the top K most frequently repeated input values.

# Problem #2

On the Code Jam team, we enjoy sending each other pangrams, which are phrases that use each letter of the English alphabet at least once. One common example of a pangram is "the quick brown fox jumps over the lazy dog". Sometimes our pangrams contain confidential information — for example, CJ QUIZ: KNOW BEVY OF DP FLUX ALGORITHMS — so we need to keep them secure.

Expand All @@ -13,44 +23,42 @@ For example, suppose that N = 103 and we chose to use the first 26 odd prime num

We will give you a ciphertext message and the value of N that we used. We will not tell you which primes we used, or how to decrypt the ciphertext. Do you think you can recover the plaintext anyway?

#Input
## Input
The first line of the input gives the number of test cases, T. T test cases follow; each test case consists of two lines. The first line contains two integers: N, as described above, and L, the length of the list of values in the ciphertext. The second line contains L integers: the list of values in the ciphertext.

#Output
## Output
For each test case, output one line containing Case #x: y, where x is the test case number (starting from 1) and y is a string of L + 1 uppercase English alphabet letters: the plaintext.

#Limits
## Limits
1 ≤ T ≤ 100.

Time limit: 20 seconds per test set.

Memory limit: 1 GB.

25 ≤ L ≤ 100.

The plaintext contains each English alphabet letter at least once.

Test set 1 (Visible)

101 ≤ N ≤ 10000.

Test set 2 (Hidden)
101 ≤ N ≤ 10^100.

#Sample Input

10000 25
3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543/2411*7919 7919*7919

# Test Cases
101 ≤ N ≤ 10^100.

## Sample Input
10000 26

3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 17524747 62710561

103 31

217 1891 4819 2291 2987 3811 1739 2491 4717 445 65 1079 8383 5353 901 187 649 1003 697 3239 7663 291 123 779 1007 3551 1943 2117 1679 989 3053
103 32
217 961 1891 4819 2291 2987 3811 1739 2491 4717 445 65 1079 8383 5353 901 187 649 1003 697 3239 7663 291 123 779 1007 3551 1943 2117 1679 989 3053
10000 25
3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543
10000 28
3292937 3892729 175597 7921 7921 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543

# Sample Output

SUBDERMATOGLYPHICFJKNQVWXZZ

Example Output
SUUUUUUUUUUUUUUUUUUBBBDERMATOGLYPHICFJKNQVWXZZZZZZ
ZUUUUUUUUUUUUUUUUUUBBBDERMATOGLYPHICFJKNQVWXZZZZZZZ
CJQUIZKNOWBEVYOFDPFLUXALGORITHMS
14 changes: 7 additions & 7 deletions src/cryptopangrams.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#include <string>
#include <iostream>
#include <vector>
#include <utility>
#include <chrono>
#include <cmath>
#include <limits>
#include <deque>
#include <algorithm>
#include <deque>
#include <limits>
#include <iostream>
#include <sstream>
#include <set>
#include <string>
#include <thread>
#include <chrono>
#include <utility>
#include <vector>

namespace codejam
{
Expand Down
69 changes: 69 additions & 0 deletions src/topK.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

/**
* Comparator that orders (value, count) pairs by descending count.
*
* Only the `second` member of each pair is inspected; pairs with equal
* counts compare as equivalent (the function returns false both ways).
* Arguments are taken by const reference so that sorting does not copy
* the stored value (e.g. a std::string) on every comparison.
*
* @param pairOne First input pair.
* @param pairTwo Second input pair.
* @return Returns true if the second value of pairOne is strictly larger
* than that of pairTwo.
*/
template<class T>
bool sortbysecond(const std::pair<T, int>& pairOne, const std::pair<T, int>& pairTwo){
    return pairOne.second > pairTwo.second;
}

/**
* A function that returns the topK most frequently occurring values in the data.
*
* The result is ordered by descending frequency. Values with the same
* frequency appear in ascending order (as defined by operator< on T).
* If the data contains fewer than topK distinct values, all distinct
* values are returned.
*
* @param data Vector of data. T must support operator< and operator==.
* @param topK Number of most frequent values requested.
* @return Returns a vector with at most topK values. The vector is empty when
* topK is not positive or when data holds fewer than topK elements.
*/
template<class T>
std::vector<T> getTopK(const std::vector<T>& data, int topK){
    typedef typename std::vector<T>::size_type SizeType;

    std::vector<T> resultVector;

    // Invalid input: non-positive topK, or data size smaller than topK.
    // The sign check comes first so the cast below never sees a negative value.
    if (topK <= 0 || data.size() < static_cast<SizeType>(topK)) return resultVector;

    // Sort a copy so equal values become adjacent and distinct values ascend.
    std::vector<T> sortedData(data);
    std::sort(sortedData.begin(), sortedData.end());

    // Collapse each run of equal values into one (value, frequency) pair.
    std::vector<std::pair<T, int> > dataCounter;
    for (const auto& value : sortedData)
    {
        if (!dataCounter.empty() && dataCounter.back().first == value)
            ++dataCounter.back().second;
        else
            dataCounter.push_back(std::make_pair(value, 1));
    }

    // Order by descending frequency. stable_sort keeps the ascending value
    // order among pairs with equal frequency, making tie order deterministic.
    std::stable_sort(dataCounter.begin(), dataCounter.end(),
                     [](const std::pair<T, int>& lhs, const std::pair<T, int>& rhs)
                     { return lhs.second > rhs.second; });

    // Copy out exactly topK values (or every distinct value if there are fewer).
    const SizeType keep = std::min(static_cast<SizeType>(dataCounter.size()),
                                   static_cast<SizeType>(topK));
    resultVector.reserve(keep);
    for (SizeType i = 0; i < keep; ++i)
    {
        resultVector.push_back(dataCounter[i].first);
    }

    return resultVector;
}
31 changes: 24 additions & 7 deletions tests/cryptopangrams_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,45 @@ TEST (CryptoPangrams, InvalidSizeL){
int N = 10000;
int L = 25;
std::vector<codejam::long_int> cryptedMessage{
3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722, 2376497, 611683, 489059,
2328901, 3150061, 829981, 421301, 76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722,
2376497, 611683, 489059,2328901, 3150061, 829981, 421301,
76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
4267589, 4729181, 17524747, 62710561};
message = decrypt(N, L, cryptedMessage);
std::cout << message << std::endl;
std::cout << "Decrypted Message: " << message << std::endl;
ASSERT_EQ(message, "");
}


TEST (CryptoPangrams, SimpleStrings){
TEST (CryptoPangrams, SimpleStrings1){
std :: string message;
int N = 10000;
int L = 26;
std::vector<codejam::long_int> cryptedMessage{
3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722, 2376497, 611683, 489059,
2328901, 3150061, 829981, 421301, 76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722,
2376497, 611683, 489059,2328901, 3150061, 829981, 421301,
76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
4267589, 4729181, 17524747, 62710561};
message = decrypt(N, L, cryptedMessage);
std::cout << message << std::endl;
std::cout << "Decrypted Message: " << message << std::endl;
ASSERT_EQ(message, "SUBDERMATOGLYPHICFJKNQVWXZZ");
}

TEST (CryptoPangrams, SimpleStrings2){
std :: string message;
int N = 103;
int L = 31;
std::vector<codejam::long_int> cryptedMessage{
217, 1891, 4819, 2291, 2987, 3811, 1739, 2491, 4717, 445,
65, 1079, 8383, 5353, 901, 187, 649, 1003, 697, 3239, 7663,
291, 123, 779, 1007, 3551, 1943, 2117, 1679, 989, 3053};
message = decrypt(N, L, cryptedMessage);
std::cout << "Decrypted Message: " << message << std::endl;
ASSERT_EQ(message, "CJQUIZKNOWBEVYOFDPFLUXALGORITHMS");
}

//[TODO] Generate Complex Test Cases with large datasets

int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
69 changes: 69 additions & 0 deletions tests/topK_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "src/topK.cpp"
#include <gtest/gtest.h>


// An empty input holds fewer elements than the requested K, so getTopK
// must return an empty vector.
TEST (SimpleIntVector, NullInputTest){
    std::vector<int> data{};
    int topK = 3;
    auto resultVector = getTopK(data, topK);
    // empty() avoids comparing the unsigned size() against a signed literal.
    ASSERT_TRUE(resultVector.empty());
}

// Requesting more values (11) than the input contains (10) is treated as
// invalid input, so getTopK must return an empty vector.
TEST (SimpleIntVector, KLargerThanSizeOfData){
    std::vector<int> data{1, 2, 3, 7, 3, 3, 2, 2, 3, 1};
    int topK = 11;
    auto resultVector = getTopK(data, topK);
    // empty() avoids comparing the unsigned size() against a signed literal.
    ASSERT_TRUE(resultVector.empty());
}

// Frequencies in the input: 3 -> 4 times, 2 -> 3 times, 1 -> 2 times, 7 -> once.
// The three most frequent values must come back in descending frequency order.
TEST (SimpleIntVector, Sample1){
    std::vector<int> data{1, 2, 3, 7, 3, 3, 2, 2, 3, 1};
    int topK = 3;
    auto resultVector = getTopK(data, topK);
    // Guard the indexing below: reading past the end would be undefined behaviour.
    ASSERT_GE(resultVector.size(), 3u);
    ASSERT_EQ(resultVector[0], 3);
    ASSERT_EQ(resultVector[1], 2);
    ASSERT_EQ(resultVector[2], 1);
}

// K (7) exceeds the number of distinct values (4) but not the input size (14),
// so every distinct value is returned, ordered by descending frequency:
// 7 -> 5 times, 3 -> 4 times, 2 -> 3 times, 1 -> 2 times.
TEST (SimpleIntVector, Sample2){
    std::vector<int> data{1, 2, 3, 7, 7, 7, 7, 7, 3, 3, 2, 2, 3, 1};
    int topK = 7;
    auto resultVector = getTopK(data, topK);
    // Only four distinct values exist; checking the size also guards the indexing.
    ASSERT_EQ(resultVector.size(), 4u);
    ASSERT_EQ(resultVector[0], 7);
    ASSERT_EQ(resultVector[1], 3);
    ASSERT_EQ(resultVector[2], 2);
    ASSERT_EQ(resultVector[3], 1);
}

// Character input: 'a' occurs 5 times and 'm' occurs 4 times, more than any
// other character, so they must be the top two in that order.
TEST (SimpleIntVector, SampleCharacters){
    std::vector<char> data{'p','a','n','o','r','a','m','a','a','n','d','m','e','m','o','r','a','n','d','u','m'};
    int topK = 2;
    auto resultVector = getTopK(data, topK);
    // Guard the indexing below: reading past the end would be undefined behaviour.
    ASSERT_GE(resultVector.size(), 2u);
    ASSERT_EQ(resultVector[0], 'a');
    ASSERT_EQ(resultVector[1], 'm');
}

// Every character occurs exactly once, so all frequencies tie.
// Expect ascending order for equal frequency: 'a' and 'b' are the two
// smallest values and must come first.
TEST (SimpleIntVector, SampleNonRepeatingCharacters){
    std::vector<char> data{'c','d','e','l','m','o','x','y','z','a','b'};
    int topK = 2;
    auto resultVector = getTopK(data, topK);
    // Guard the indexing below: reading past the end would be undefined behaviour.
    ASSERT_GE(resultVector.size(), 2u);
    ASSERT_EQ(resultVector[0], 'a');
    ASSERT_EQ(resultVector[1], 'b');
}


// String input: "apples" and "the" each occur 3 times (note that "There" is a
// different string), more than any other word. The expected order relies on
// ties being returned in ascending order, so "apples" precedes "the".
TEST (SimpleIntVector, SampleStrings){
    std::vector<std::string> data{"There", "are", "many", "many", "apples", "in", "the", "tree",
                                  "but", "the", "apples", "are", "really", "really", "small",
                                  "than", "the", "apples", "from", "last", "year"};
    int topK = 2;
    auto resultVector = getTopK(data, topK);
    // Guard the indexing below: reading past the end would be undefined behaviour.
    ASSERT_GE(resultVector.size(), 2u);
    ASSERT_EQ(resultVector[0], "apples");
    ASSERT_EQ(resultVector[1], "the");
}

int main(int argc, char** argv){
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

0 comments on commit daf5145

Please sign in to comment.