#1
  1. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0

    Trouble comparing elements of large files


    Hey guys,

    I've been plugging away at this and keep running into compiler errors or the program simply not functioning as intended.

    I'm generating chains of numbers from file1 and comparing and checking them to file2. If the generated chain is within certain lengths then it's output to a file containing the newly generated strings.

    For the record, file1 is 8k integers and file2 is 27mil integers.
    I need to do this for 3 different mathematical formulas while still using the same table of numbers.

    The integers are generally stored in a file in this format and are sorted numerically from low to high:
    1 2 3 4 5 6 7 8
    9 10 11 12 13 14 15 16
    ...
    ...
    Although sometimes there are extra spaces because running Replace with notepadd++ across 27mil integers causes freezes.

    Here's what I was thinking/trying:
    Code:
    #include <iostream>
    #include <fstream>
    #include <algorithm>
    #include <vector>
    #include <string.h>
    #include <stdlib.h>
    using namespace std; using std::vector; using std::string;
    
    // reads the file and store it in a vector
    void Read(string filename, vector<long> &output);
    
    //calculates all the sets of chains for the first equation
    void Chain1(vector<long> &dataFile1, vector<long> &dataFile2);
    
    //second set of equations for another outputfile
    void Chain2(vector<long> &adataFile1, vector<long> &dataFile2);
    
    //third set
    void Chain3(vector<long> &dataFile1, vector<long> &dataFile2);
    
    int main()
    {
        //vector for file1, 8k integers
        vector<long> FromFile1;
    
        //vector for file2, 27mil integers
        vector<long> FromFile2;
        string file1 = "one.txt";
        string file2 = "two.txt";
    
        Read(file1, FromFile1);
        Read(file2, FromFile2);
    
        Chain1(FromFile1,FromFile2);
        Chain2(FromFile1,FromFile2);
        Chain3(FromFile1,FromFile2);
    }
    
    void Read(string filename, vector<long> &output)
    {
    	ifstream file(filename);
    
            vector<char *> dataFromFile;
    	char *cstr;
    	if(file.is_open())
    	{
    		do
    		{
    			cstr = (char *)malloc(256);
    			file.getline(cstr, 256);
    			dataFromFile.push_back(cstr);
    		}while(!file.eof());
    		cout << "File successfully read" << endl;
    	}
    	else
    		cout << "Failed to open file" << endl;
    
     	vector<char *> tokens;
    	char *pChar;
            int numberFromFile;
    
    	for(int i = 0; i < dataFromFile.size(); i++)
    	{
    		pChar = strtok(dataFromFile[i], " ");
    		while (pChar != NULL)
    		{
    			tokens.push_back(pChar);
    			pChar = strtok (NULL, " ");
    		}
                    numberFromFile = atoi(tokens[0]);
    		output.push_back(numberFromFile);
                    tokens.clear();
    	}
    
                 //free the memory allocated earlier
    	for(int i = 0; i < dataFromFile.size(); i++)
    	{
    		free(dataFromFile[i]);
    	}
    
                 //close the file
    	file.close();
    }
    
    
    
    // function to create chains
    void Chain1(vector<long> &dataFile1, vector<long> &dataFile2)
    {
       // three iterator ints (for loop is giving my compiler fits)
       int i =0;
       int g = 0;
       int n = 0;
    
       // storage vector for the number chains as they're generated
       vector<int> chains;
    
       //variable for the current number being tested to be added to
       //chains
       int testNumber = 0;
    
       //open file to output chains to
       ofstream compChains("Chains1.txt");
    
    while (i<dataFile1.size())
        {
    
             //initialize testNumber to current element in dataFile1
             //being access sequentially
             testNumber = dataFile1[i];
    
             //add testNumber to chains vector so the initial
             //number from the smaller file is contained within
             chains.push_back(testNumber);
    
             do
            {
                // perform simple equation to find the next possible
                //number in the chain
                testNumber = testNumber * 2 + 1;
    
                //test the newly generated number to see if it is in the
                // larger list of numbers (I know find doesn't work for
                //vectors)
                if (testNumber==dataFile2[dataFile2.find(testNumber)])
    
                {
                    //add newly generated number if it passes and
                    //iterate
                    chains.push_back(cPrime);
                    g++;
                }
                else
                {
                    //If new number isn't on list break out and check
                    //the chain
                    break;
                }
                if(chains.size()>=10)
                {
                    //break out and check the chain if number of
                    //elements is 10 or more
                    break;
                }
    
            }while(g<dataFile2.size());
    
        //chains of 6 numbers or more are the only ones outputted to
        // a new file and cout so i can check on progress
        if (chains.size()>5)
            {
                cout << "Success!" << endl;
                while (n<chains.size())
                {
                    cout << chains[n] << ' ';
                    compChains << chains[n] << ' ';
                    n++;
                }
                cout << endl;
                compChains << endl;
            }
            //clear chains and iterate one to test the next element in
            // dataFile1
            chains.clear();
            i++;
        }
    }
    
    
    
    void Chain2(vector<long> &dataFile1; vector<long> &dataFile2)
    {
           // same format just different equation to generate new
           //number
    }
    void Chain3(vector<long> &dataFile1; vector<long> &dataFile2)
    {
           // same format just different equation to generate new
           //number
    }
    I'm mostly using vectors here but not being able to easily search them is a pain and doing a linear search on a 27mil element vector up to 10 times each of the 8k elements in the original vector is... computer melting.

    So what kind of container should I use?
    I only have ints going into the dataFile containers so this container doesn't need to be templated out with multiple members it just needs to be quickly accessed by index and by searching.

    I've been wailing away at this for 3 days now and it either gives me compiler errors because xContainer can't compare int/iterator/whatever or it's not indexing and pulling the files correctly or something along those lines.

    I could use some help.
    Last edited by Maligned; December 9th, 2013 at 08:32 PM. Reason: updated code
  2. #2
  3. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    updated the code to reflect fixes i made in regards to listing the wrong variable/vector
  4. #3
  5. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,841
    Rep Power
    480
    You've mentioned "binary search". Objects in vectors are not sorted. The "find" method you actually invoke must use linear search. See if

    std::binary_search


    helps.
    [code]Code tags[/code] are essential for python code and Makefiles!
  6. #4
  7. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    Originally Posted by b49P23TIvg
    You've mentioned "binary search". Objects in vectors are not sorted. The "find" method you actually invoke must use linear search. See if

    std::binary_search


    helps.
    I swapped terms there accidentally (I meant linear search will take forever). But, in this case the vector actually will be "sorted" simply because the data being entered into the vector is listed from lowest number to highest. So vector[0], would be the lowest number in the vector and vector[(vector.size()-1)] would be the highest.

    I think i have the searching function down(i won't be using find), but my problem right now is getting the files to store into the vector properly, i.e. 1 element for each integer. The integers are separated by spaces and '\n' but sometimes there's 2 spaces in front and sometimes there's just 1 and it's giving me fits.
  8. #5
  9. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,841
    Rep Power
    480
    I understand that the data is sorted.

    You need to tell the computer it's sorted.

    And you do that with binary_search instead of with find.
    [code]Code tags[/code] are essential for python code and Makefiles!
  10. #6
  11. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    Originally Posted by b49P23TIvg
    I understand that the data is sorted.

    You need to tell the computer it's sorted.

    And you do that with binary_search instead of with find.
    I put find in there just for the sake of it being more understandable. I have no intention of doing find.

    my real issue right now is getting the data off of the file and into a vector where vector<int> myVect[i]; points to a single integer and not a line of strings. Not that i could write a line of strings into an integer vector.
  12. #7
  13. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,841
    Rep Power
    480
    at the risk of your explaining that you had no intention of actually using the code you posted, and being a c programmer not a c++ programmer, I would not store the entire dang input file.
    Code:
    void Read(string filename, vector<long> &output) {
      FILE*inf fopen((magical conversion from string to char*)filename,"r");
      if(NULL != inf) {
        for(;;) {
          int datum;
          if (1 != fscanf(inf,"%d",&datum))
    	break;
          output.push_back(datum);
        }
        fclose(ouf);
      }
    }
    [code]Code tags[/code] are essential for python code and Makefiles!
  14. #8
  15. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    Originally Posted by b49P23TIvg
    at the risk of your explaining that you had no intention of actually using the code you posted, and being a c programmer not a c++ programmer, I would not store the entire dang input file.
    I apologize if i came off as snappy but this has been eluding me for for a few days. That was the code i was trying but it wasn't working so i searched around the internet for several hours trying all the examples and different ways i could find of performing this Read function but they either: didn't compile correctly despite me having the required headers and declarations, or vomited out random integers from the file in no semblance of an order before infinite looping.
    So, i came here for some help since I'm neither a C or C++ programmer and haven't written a program like this is years.
  16. #9
  17. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,841
    Rep Power
    480
    Some how you'll need to convert the string data type to char* as an ASCIIz string. If you need to use the string just once, and that's all, according to the internet site I found there's a c_str method:


    FILE*inf fopen(filename.c_str(),"r");
    [code]Code tags[/code] are essential for python code and Makefiles!
  18. #10
  19. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    Originally Posted by b49P23TIvg
    Some how you'll need to convert the string data type to char* as an ASCIIz string. If you need to use the string just once, and that's all, according to the internet site I found there's a c_str method:


    FILE*inf fopen(filename.c_str(),"r");
    I got that down but it's crashing after it checks if(NULL !=inf)

    Code:
    void Read(string filename, vector<long> &output) {
    FILE*inf = fopen(filename.c_str(),"r");
    cout << 1 << endl;
    if(NULL != inf) {
        cout << 2 <<endl;
         for(;;) {
           cout << 3 << endl;
           long datum;
           if (1 != fscanf(inf,"%ld",&datum)){
                 cout << 4 << endl;
                 break;
           }
           output.push_back(datum);
         }
         fclose(inf);
       }
     }
    ran this and got 1 before crash so i know it's breaking right at the first branch. returns status -1073741510

    edit: i tried harcoding the filename in and got the same result.
    some of these integers are over 100,000,000 and there are 27million of them should I be calling _wfopen instead?
    changed the name of the file and was able to get it to the break but once it pushes to output it crashes.
  20. #11
  21. Contributing User
    Devshed Demi-God (4500 - 4999 posts)

    Join Date
    Aug 2011
    Posts
    4,841
    Rep Power
    480
    For input file named a with content

    1 2 3
    4 5 6 7

    this program
    Code:
    #include<iostream>
    #include<fstream>
    #include<algorithm>
    #include<vector>
    #include<string.h>
    #include<stdlib.h>
    #include<stdio.h>
    
    void Read(std::string filename,std::vector<long>&output) {
      FILE*inf = fopen(filename.c_str(),"r");
      if (NULL != inf) {
        for(;;) {
          long datum;
          if (1 != fscanf(inf,"%ld",&datum))
    	break;
          output.push_back(datum);
        }
        fclose(inf);
      }
    }
    
    int main() {
      std::vector<long>v;
      Read(std::string("a"),v);
      for (unsigned i=0; i < v.size(); ++i)
        printf("%ld\n",v[i]);
      return 0;
    }
    works as I expect, which is reasonable. I don't think a wide character version of fopen helps in any way.

    Use sed emacs or gawk to modify your files. Stay away from notepad!
    [code]Code tags[/code] are essential for python code and Makefiles!
  22. #12
  23. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Dec 2013
    Posts
    7
    Rep Power
    0
    Originally Posted by b49P23TIvg
    Use sed emacs or gawk to modify your files. Stay away from notepad!
    got it working, I just needed to rename the file outside and inside the program and rebuild the workstation. Something funny was stuck in memory for the file it seems but it's working as intended now.

    thanks a lot.

IMN logo majestic logo threadwatch logo seochat tools logo