Extract Top K elements from data

robosherpa · Apr 12, 2019 · daf5145 · daf5145
1 parent 0e8ce4d
commit daf5145
Show file tree

Hide file tree

Showing 6 changed files with 212 additions and 37 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 2.6)
-project(cryptopangrams)
+project(sample-code)
 
 # Locate GTest
 find_package(GTest REQUIRED)
@@ -11,12 +11,24 @@ include_directories(
 
 # Link executables
 add_executable(
-    cryptopangramsTests
+    sampleCryptoPangramsTest
     tests/cryptopangrams_test.cpp
     )
 
 target_link_libraries(
-    cryptopangramsTests
+    sampleCryptoPangramsTest
+    ${GTEST_LIBRARIES}
+    pthread
+    )
+
+# Link executables
+add_executable(
+    sampleTopKTest
+    tests/topK_test.cpp
+    )
+
+target_link_libraries(
+    sampleTopKTest
     ${GTEST_LIBRARIES}
     pthread
     )
diff --git a/README.MD b/README.MD
@@ -1,4 +1,14 @@
-# Problem
+# Problem #1
+
+Given a vector of input data, return the top K most frequently occurring values in that data. Make appropriate assumptions.
+
+## Input
+The input can be a vector of integers, floating-point numbers, characters, or strings.
+
+## Output
+A vector of the top K most frequently repeated input values.
+
+# Problem #2
 
 On the Code Jam team, we enjoy sending each other pangrams, which are phrases that use each letter of the English alphabet at least once. One common example of a pangram is "the quick brown fox jumps over the lazy dog". Sometimes our pangrams contain confidential information — for example, CJ QUIZ: KNOW BEVY OF DP FLUX ALGORITHMS — so we need to keep them secure.
 
@@ -13,44 +23,42 @@ For example, suppose that N = 103 and we chose to use the first 26 odd prime num
 
 We will give you a ciphertext message and the value of N that we used. We will not tell you which primes we used, or how to decrypt the ciphertext. Do you think you can recover the plaintext anyway?
 
-#Input
+## Input
 The first line of the input gives the number of test cases, T. T test cases follow; each test case consists of two lines. The first line contains two integers: N, as described above, and L, the length of the list of values in the ciphertext. The second line contains L integers: the list of values in the ciphertext.
 
-#Output
+## Output
 For each test case, output one line containing Case #x: y, where x is the test case number (starting from 1) and y is a string of L + 1 uppercase English alphabet letters: the plaintext.
 
-#Limits
+## Limits
 1 ≤ T ≤ 100.
+
 Time limit: 20 seconds per test set.
+
 Memory limit: 1 GB.
+
 25 ≤ L ≤ 100.
+
 The plaintext contains each English alphabet letter at least once.
 
 Test set 1 (Visible)
+
 101 ≤ N ≤ 10000.
 
 Test set 2 (Hidden)
-101 ≤ N ≤ 10100.
-
-#Sample Input
 
-10000 25
-3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543/2411*7919 7919*7919
-
-# Test Cases
+101 ≤ N ≤ 10^100.
 
+## Sample Input
 10000 26
+
 3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 17524747 62710561
+
 103 31
+
 217 1891 4819 2291 2987 3811 1739 2491 4717 445 65 1079 8383 5353 901 187 649 1003 697 3239 7663 291 123 779 1007 3551 1943 2117 1679 989 3053
-103 32
-217 961 1891 4819 2291 2987 3811 1739 2491 4717 445 65 1079 8383 5353 901 187 649 1003 697 3239 7663 291 123 779 1007 3551 1943 2117 1679 989 3053
-10000 25
-3292937 175597 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543
-10000 28
-3292937 3892729 175597 7921 7921 18779 50429 375469 1651121 2102 3722 2376497 611683 489059 2328901 3150061 829981 421301 76409 38477 291931 730241 959821 1664197 3057407 4267589 4729181 5335543
 
+## Sample Output
+
+SUBDERMATOGLYPHICFJKNQVWXZZ
 
-Example Output
-SUUUUUUUUUUUUUUUUUUBBBDERMATOGLYPHICFJKNQVWXZZZZZZ
-ZUUUUUUUUUUUUUUUUUUBBBDERMATOGLYPHICFJKNQVWXZZZZZZZ
+CJQUIZKNOWBEVYOFDPFLUXALGORITHMS
diff --git a/src/cryptopangrams.cpp b/src/cryptopangrams.cpp
@@ -1,15 +1,15 @@
-#include <string>
-#include <iostream>
-#include <vector>
-#include <utility>
+#include <chrono>
 #include <cmath>
-#include <limits>
-#include <deque>
 #include <algorithm>
+#include <deque>
+#include <limits>
+#include <iostream>
 #include <sstream>
 #include <set>
+#include <string>
 #include <thread>
-#include <chrono>
+#include <utility>
+#include <vector>
 
 namespace codejam
 {

diff --git a/src/topK.cpp b/src/topK.cpp
@@ -0,0 +1,69 @@
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+/**
+ * Comparator that orders (value, count) pairs by descending count.
+ *
+ * Only the counts (the `second` members) are compared; the values are
+ * ignored, so two pairs with equal counts compare as equivalent.
+ * The pairs are taken by const reference so that comparing does not copy T.
+ *
+ * @param pairOne First input pair.
+ * @param pairTwo Second input pair.
+ * @return Returns true if the second value of pairOne is strictly larger
+ * than that of pairTwo
+ */
+template<class T>
+bool sortbysecond(const std::pair<T, int>& pairOne, const std::pair<T, int>& pairTwo){
+  return pairOne.second > pairTwo.second;
+}
+
+/**
+ * A function that returns the topK most frequently occurring distinct
+ * values in a given data
+ *
+ * Values are ordered by descending frequency; values with the same
+ * frequency are returned in ascending order.
+ *
+ * @param data Vector of data. T must support operator< and operator==.
+ * @param topK Number of most frequent values to return.
+ * @return Returns a vector of at most topK values. The vector is empty when
+ * topK is not positive or is larger than data.size(); it holds fewer than
+ * topK values when data contains fewer than topK distinct values.
+ */
+template<class T>
+std::vector<T> getTopK(const std::vector<T>& data, int topK){
+  std::vector<T> resultVector;
+
+  // Invalid input: topK must be positive and no larger than the data size.
+  // The sign check comes first so the cast below cannot wrap a negative value.
+  if(topK <= 0 || data.size() < static_cast<std::size_t>(topK)) return resultVector;
+
+  // Sort a copy of the data so that equal values become adjacent
+  std::vector<T> sortedData(data);
+  std::sort(sortedData.begin(), sortedData.end());
+
+  // Create a Vector of pairs for each individual value and its total frequency
+  // with a single pass over the sorted runs (pairs end up in ascending value order)
+  std::vector <std::pair<T,int> > dataCounter;
+  for( const auto& value : sortedData){
+    if(!dataCounter.empty() && dataCounter.back().first == value)
+      dataCounter.back().second++;
+    else
+      dataCounter.push_back(std::make_pair(value, 1));
+  }
+
+  // Sort the pair vector by number of repetitions (TopK).
+  // stable_sort keeps values with equal frequency in ascending order;
+  // plain std::sort gives no ordering guarantee for ties.
+  std::stable_sort(dataCounter.begin(), dataCounter.end(), sortbysecond<T>);
+
+  // Store exactly topK values (or all of them when fewer exist) and return
+  const std::size_t resultSize = std::min(dataCounter.size(), static_cast<std::size_t>(topK));
+  resultVector.reserve(resultSize);
+  for (std::size_t i = 0; i < resultSize; ++i)
+  {
+    resultVector.push_back(dataCounter[i].first);
+  }
+
+  return resultVector;
+}
diff --git a/tests/cryptopangrams_test.cpp b/tests/cryptopangrams_test.cpp
@@ -6,28 +6,45 @@ TEST (CryptoPangrams, InvalidSizeL){
   int N = 10000;
   int L = 25;
   std::vector<codejam::long_int> cryptedMessage{
-    3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722, 2376497, 611683, 489059,
-    2328901, 3150061, 829981, 421301, 76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
+    3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722,
+    2376497, 611683, 489059,2328901, 3150061, 829981, 421301,
+    76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
     4267589, 4729181, 17524747, 62710561};
   message = decrypt(N, L, cryptedMessage);
-  std::cout << message << std::endl;
+  std::cout << "Decrypted Message: " << message << std::endl;
   ASSERT_EQ(message, "");
 }
 
 
-TEST (CryptoPangrams, SimpleStrings){
+TEST (CryptoPangrams, SimpleStrings1){
   std :: string message;
   int N = 10000;
   int L = 26;
   std::vector<codejam::long_int> cryptedMessage{
-    3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722, 2376497, 611683, 489059,
-    2328901, 3150061, 829981, 421301, 76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
+    3292937, 175597, 18779, 50429, 375469, 1651121, 2102, 3722,
+    2376497, 611683, 489059,2328901, 3150061, 829981, 421301,
+    76409, 38477, 291931, 730241, 959821, 1664197, 3057407,
     4267589, 4729181, 17524747, 62710561};
   message = decrypt(N, L, cryptedMessage);
-  std::cout << message << std::endl;
+  std::cout << "Decrypted Message: " << message << std::endl;
   ASSERT_EQ(message, "SUBDERMATOGLYPHICFJKNQVWXZZ");
 }
 
+// Decrypts the second README sample (N = 103, L = 31) and checks the
+// recovered plaintext against the expected pangram.
+TEST (CryptoPangrams, SimpleStrings2){
+  std::vector<codejam::long_int> cipherText{
+    217, 1891, 4819, 2291, 2987, 3811, 1739, 2491, 4717, 445,
+    65, 1079, 8383, 5353, 901, 187, 649, 1003, 697, 3239, 7663,
+    291, 123, 779, 1007, 3551, 1943, 2117, 1679, 989, 3053};
+  int upperBound = 103;
+  int cipherLength = 31;
+  std::string plainText;
+  plainText = decrypt(upperBound, cipherLength, cipherText);
+  std::cout << "Decrypted Message: " << plainText << std::endl;
+  ASSERT_EQ(plainText, "CJQUIZKNOWBEVYOFDPFLUXALGORITHMS");
+}
+
+//[TODO] Generate Complex Test Cases with large datasets
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();

diff --git a/tests/topK_test.cpp b/tests/topK_test.cpp
@@ -0,0 +1,69 @@
+#include "src/topK.cpp"
+#include <gtest/gtest.h>
+
+
+// Empty input: the data size (0) is smaller than topK, so the result must be empty.
+TEST (SimpleIntVector, NullInputTest){
+  std::vector<int> data{};
+  int topK = 3;
+  auto resultVector = getTopK(data, topK);
+  // empty() avoids comparing the unsigned size() against a signed literal
+  ASSERT_TRUE(resultVector.empty());
+}
+
+// topK (11) exceeds the number of input elements (10): treated as invalid
+// input, so the result must be empty.
+TEST (SimpleIntVector, KLargerThanSizeOfData){
+  std::vector<int> data{1, 2, 3, 7, 3, 3, 2, 2, 3, 1};
+  int topK = 11;
+  auto resultVector = getTopK(data, topK);
+  // empty() avoids comparing the unsigned size() against a signed literal
+  ASSERT_TRUE(resultVector.empty());
+}
+
+// Frequencies: 3 occurs 4 times, 2 occurs 3 times, 1 occurs twice, 7 once.
+TEST (SimpleIntVector, Sample1){
+  std::vector<int> data{1, 2, 3, 7, 3, 3, 2, 2, 3, 1};
+  int topK = 3;
+  auto resultVector = getTopK(data, topK);
+  // Guard the indexing below: a short result must fail the test, not read out of bounds
+  ASSERT_GE(resultVector.size(), 3u);
+  ASSERT_EQ(resultVector[0], 3);
+  ASSERT_EQ(resultVector[1], 2);
+  ASSERT_EQ(resultVector[2], 1);
+}
+
+// topK (7) is larger than the number of distinct values (4), so every
+// distinct value is returned, ordered by descending frequency.
+TEST (SimpleIntVector, Sample2){
+  std::vector<int> data{1, 2, 3, 7, 7, 7, 7, 7, 3, 3, 2, 2, 3, 1};
+  int topK = 7;
+  auto resultVector = getTopK(data, topK);
+  // Only four distinct values exist; checking the size also guards the indexing below
+  ASSERT_EQ(resultVector.size(), 4u);
+  ASSERT_EQ(resultVector[0], 7);
+  ASSERT_EQ(resultVector[1], 3);
+  ASSERT_EQ(resultVector[2], 2);
+  ASSERT_EQ(resultVector[3], 1);
+}
+
+// Character input: 'a' occurs 5 times and 'm' occurs 4 times.
+TEST (SimpleIntVector, SampleCharacters){
+  std::vector<char> data{'p','a','n','o','r','a','m','a','a','n','d','m','e','m','o','r','a','n','d','u','m'};
+  int topK = 2;
+  auto resultVector = getTopK(data, topK);
+  // Guard the indexing below: a short result must fail the test, not read out of bounds
+  ASSERT_GE(resultVector.size(), 2u);
+  ASSERT_EQ(resultVector[0], 'a');
+  ASSERT_EQ(resultVector[1], 'm');
+}
+
+// Expect ascending order for equal frequency: every character occurs once,
+// so the two smallest characters are expected first.
+TEST (SimpleIntVector, SampleNonRepeatingCharacters){
+  std::vector<char> data{'c','d','e','l','m','o','x','y','z','a','b'};
+  int topK = 2;
+  auto resultVector = getTopK(data, topK);
+  // Guard the indexing below: a short result must fail the test, not read out of bounds
+  ASSERT_GE(resultVector.size(), 2u);
+  ASSERT_EQ(resultVector[0], 'a');
+  ASSERT_EQ(resultVector[1], 'b');
+}
+
+
+// String input: "apples" and "the" both occur 3 times ("There" is a different
+// string), so the tie is expected in ascending order.
+TEST (SimpleIntVector, SampleStrings){
+  std::vector<std::string> data{"There", "are", "many", "many", "apples", "in", "the", "tree",
+                                "but", "the", "apples", "are", "really", "really", "small",
+                                "than", "the", "apples", "from", "last", "year"};
+  int topK = 2;
+  auto resultVector = getTopK(data, topK);
+  // Guard the indexing below: a short result must fail the test, not read out of bounds
+  ASSERT_GE(resultVector.size(), 2u);
+  ASSERT_EQ(resultVector[0], "apples");
+  ASSERT_EQ(resultVector[1], "the");
+}
+
+// Test runner entry point: passes the command-line arguments to GoogleTest,
+// runs all registered tests and returns RUN_ALL_TESTS()'s result as the exit code.
+int main(int argc, char** argv){
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exitCode = RUN_ALL_TESTS();
+  return exitCode;
+}