Main Page/Research/MSB/Scripts/msb remdupes.pl

From phurvitz
Jump to: navigation, search
#! /usr/bin/perl
# remove records with duplicate timestamps
#
# Usage: msb_remdupes.pl <infile>
#
# Reads a comma-separated file whose third field (index 2) is a timestamp
# and drops every record whose timestamp equals that of the record
# immediately before it.  Only consecutive duplicates are detected.
#
# Two files are written next to the input:
#   <name>.fixed.csv  - the first line plus every record that was kept
#   <name>.error.csv  - the input line number of every record dropped
#
# The first line of the input is always copied to the output unchanged.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# handle args: without an input file there is nothing to do, so stop here
# instead of falling through with an undefined file name
if (@ARGV < 1) {
    print STDERR "usage: $0 <infile>\n";
    exit 1;
}

# open the input file
my $infile = $ARGV[0];
if (!-e $infile) {
    die "$infile does not exist!\n";
}
open(my $in_fh, '<', $infile)
    or die "cannot open $infile for reading: $!\n";

# open the output file
my $outfile = derived_name($infile, 'fixed');
open(my $out_fh, '>', $outfile)
    or die "cannot open $outfile for writing: $!\n";

# open the error file
my $errfile = derived_name($infile, 'error');
open(my $err_fh, '>', $errfile)
    or die "cannot open $errfile for writing: $!\n";

# start reading the file
my $prev_timestamp = '';
while (my $record = <$in_fh>) {
    chomp $record;

    # parse out the line into elements and get the timestamp; a record
    # with fewer than three fields is treated as having an empty timestamp
    my $this_timestamp = (split /,/, $record)[2];
    $this_timestamp = '' unless defined $this_timestamp;

    # the first line is always kept; any later line is kept only when its
    # timestamp differs from the previous line's
    if ($. == 1 || !same_timestamp($prev_timestamp, $this_timestamp)) {
        print {$out_fh} "$record\n";
    }
    else {
        # record the line number of the dropped duplicate
        print {$err_fh} "$.\n";
    }

    # move the comparison window to the next record
    $prev_timestamp = $this_timestamp;
}

# close the input file
close($in_fh);

# close the output file (buffered write errors surface here)
close($out_fh) or die "error writing $outfile: $!\n";

# close the error file
close($err_fh) or die "error writing $errfile: $!\n";

# Build the name of a companion file from the input path.
#   $path - path of the input file
#   $tag  - word inserted before the extension ('fixed' or 'error')
# A trailing "csv" becomes "<tag>.csv" (foo.csv -> foo.fixed.csv).  Any
# other name gets ".<tag>.csv" appended, so the result can never be the
# input path itself (which would truncate the input when opened for write).
sub derived_name {
    my ($path, $tag) = @_;
    my $name = $path;
    unless ($name =~ s/csv\z/${tag}.csv/) {
        $name .= ".${tag}.csv";
    }
    return $name;
}

# Return true when two timestamp fields denote the same time.
# Two numeric values are compared numerically (so 100 and 100.0 match);
# anything else is compared as text, so a header word or a formatted date
# is never numified to 0 or to its leading digits.
sub same_timestamp {
    my ($left, $right) = @_;
    if (looks_like_number($left) && looks_like_number($right)) {
        return $left == $right;
    }
    return $left eq $right;
}