PHP Script for Converting RPTL 1590 Reports into Excel Files

Real Property Tax Law 1590 requires that municipalities post their tax rolls, within 10 days of the proposed and final rolls being approved. Below is an PHP script that will extract the reports into a CSV file for importing into Microsoft Excel or a GIS program. It extracts the text from the PDF using pdftotext from the poppler-util.

If you do not want to install poppler-util, I would encourage to check out the simpler and better maintained R Script for for Converting RPTL 1590 Reports that I also wrote. Both versions can also be found on my GitHub.

<?php

// this program requires pdftotext (a linux program) and PHP version 7.2

// first convert PDF to text
$pdfdir = "input-pdf";
$textdir = "output-txt";

// delete old input-text
if (isset($argv[1]) && $argv[1] == 'delete') {
	echo "Deleting old conversions ...\n";
	system("rm $textdir/*");
}

foreach (scandir($pdfdir) as $file) {
	if (substr($file, -4) !== '.pdf') {
		continue;
	}
	
	$textfile = substr($file, 0, -4).".txt";
	$town = substr($file, 0, -4);
	
	echo ("#### START $town #### \n");
	
	if (file_exists("$textdir/$textfile")) {
		echo "Text file exists, not converting PDF again (arg[1] == delete to override).\n";
	}
	else {
		echo "Converting to text file ...";
		system('pdftotext -layout '.escapeshellarg("$pdfdir/$file").' '.escapeshellarg("$textdir/$textfile"));
		echo " DONE\n";
	}
	
	$text = file("$textdir/$textfile");
	$town = substr($file, 0, -4);

	$taxroll = array();
	$payerId = 0;
	
	$output = "";
	
	$townId = "";
	$swisId = "";
	$countyId = "";
	$villageId = "";
	
	for ($i = 0; $i < count($text); $i++) {
		if ($i % 100 == 0) echo "#";
		
		// capture county - town - swis
		if (preg_match('/COUNTY\s*?- (.*?)\s{2}/', $text[$i], $matches)) $countyId = $matches[1];
		if (preg_match('/CITY\s*?- (.*?)\s{2}/', $text[$i], $matches)) $townId = $matches[1];
		if (preg_match('/TOWN\s*?- (.*?)\s{2}/', $text[$i], $matches)) $townId = $matches[1];
		if (preg_match('/VILLAGE\s*?- (.*?)\s{2}/', $text[$i], $matches)) $villageId = $matches[1];
		if (preg_match('/SWIS\s*?- (.*?)\s{2}/', $text[$i], $matches)) $swisId = $matches[1];

		
		// first line = tax id
		$pattern = '/\*{3,} ((\d|\-|\.){4,}) \*{3,}/';
		preg_match($pattern, $text[$i], $matches);

		// we've found the start of a new tax record!
		if (isset($matches[1])) {
			$i++;
			
			$taxpayer = array();
			$j = 0;
			// output each part onto the line
			while (isset($text[$i]) && !preg_match('/\*{3,}/', $text[$i])) {
				$split = preg_split('/\s{2,}/', $text[$i]);
				
				$taxpayer[$j] = $split;
				$i++; $j++;
			} 
			
			
			$taxpayer[$j] = array('location',$countyId, $townId, $villageId, $swisId);
						
			$taxroll[$payerId++] = $taxpayer;
			$i--;
		}
	}

	// export unprocess tax rolls for debug
	file_put_contents("output-debug/$town.txt", print_r($taxroll,true));
	
	// next scan for all special district types in file
	$specialDistType = array();
	
	foreach ($taxroll as $taxpayer) {
		for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					if (preg_match('/^([A-Z]{2})(\d\d\d) (.*?)( TO|$|\d{2,})/', $taxpayer[$i][$j],$matches)) {
						$specialDistType[$matches[1]] = $matches[1];						
					}
				}	
		}
	}
	ksort($specialDistType);

	// then process into a nice field
	$formTax = array();

	foreach ($taxroll as $taxpayer) {
		$formPayer = array();
		
		$formPayer[0] = $taxpayer[1][0]; // tax id
		
		if (isset($taxpayer[0][1]) && preg_match('/^(\d.*?) (.*?)$/',$taxpayer[0][1], $address)) {
			$formPayer[1] = $address[1]; // street number
			$formPayer[2] = ucwords(strtolower($address[2])); // street name
		}
		elseif (isset($taxpayer[0][1]))  {
			$formPayer[1] = '';
			$formPayer[2] = ucwords(strtolower($taxpayer[0][1])); // street name
		}
		
		if (isset($formPayer[1])) $formPayer[23] = ltrim($formPayer[1].' '.$formPayer[2]); // full street 
		else if (isset($formPayer[1])) $formPayer[23] = ltrim($formPayer[2]);
		
		$formPayer[3] = ucwords(strtolower($taxpayer[2][0])); // owner 1
		
		// next five lines are either are owner or address info
		for ($i = 3; $i < 8; $i++) {
			
			if (!isset($taxpayer[$i][0])) continue;
			
			// if a taxpayer name
			if (preg_match('/^[A-Z]/',$taxpayer[$i][0]) && !preg_match('/^PO/',$taxpayer[$i][0]) && !preg_match('/^(.*?), (\w\w) (.*?)$/',$taxpayer[$i][0])) 	{
				
				if (!isset($formPayer[4])) $formPayer[4] = ucwords(strtolower($taxpayer[$i][0]));
				else if (!isset($formPayer[5])) $formPayer[5] = ucwords(strtolower($taxpayer[$i][0]));
				else if (!isset($formPayer[6])) $formPayer[6] = ucwords(strtolower($taxpayer[$i][0]));
			}
			
			// if a city - state - zip
			else if (preg_match('/^(.*?), (\w\w) (.*?)$/',$taxpayer[$i][0], $address)) {
				$formPayer[10] = ucwords(strtolower($address[1]));
				$formPayer[11] = strtoupper($address[2]);
				$formPayer[12] = ucwords(strtolower($address[3]));
			}
			
			// if an address (pad to this field)
			else if (preg_match('/^\d/',$taxpayer[$i][0]) || preg_match('/^PO/',$taxpayer[$i][0])) {
				if (!isset($formPayer[7])) $formPayer[7] =  ucwords(strtolower($taxpayer[$i][0]));
				else if (!isset($formPayer[8])) $formPayer[8] =  ucwords(strtolower($taxpayer[$i][0]));
				else if (!isset($formPayer[9])) $formPayer[9] =  ucwords(strtolower($taxpayer[$i][0]));
			}
		
		$formPayer[13] = $taxpayer[1][1];
	}
		
		// extract coordinates by searching through array
		for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/EAST-(\d*) NRTH-(\d*)/', $taxpayer[$i][$j], $coord)) {
					$formPayer[14] = $coord[1];
					$formPayer[15] = $coord[2];		
				}
			}
		}
		
		// extract acres
		
			for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/ACRES *?(\d+)/', $taxpayer[$i][$j],$acres)) {
					$formPayer[16] = $acres[1];
				}
				else if (preg_match('/ACRES/', $taxpayer[$i][$j])) {
					if (preg_match('/^([0-9.]+)/', $taxpayer[$i][$j+1], $acres)) $formPayer[16] = $acres[1];
				}
			}
		}

	// extract full market value

			for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/FULL MARKET VALUE *?(\d+)/', $taxpayer[$i][$j],$value)) {
					$formPayer[17] = str_replace(',','',$value[1]);
				}
				else if (preg_match('/FULL MARKET VALUE/', $taxpayer[$i][$j])) {
					if (preg_match('/^([0-9,]+)/', $taxpayer[$i][$j+1], $value)) $formPayer[17] = str_replace(',','',$value[1]);
				}
			}
		}
		
		// extract deed book info
			for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					
										
					if (preg_match('/DEED BOOK *?(\d+) *?PG-(\d+)/', $taxpayer[$i][$j],$value)) {
						$formPayer[18] = $value[1];
						$formPayer[19] = $value[2];
					}
					else if (preg_match('/DEED BOOK *?(\d+)/', $taxpayer[$i][$j],$value)) {
						$formPayer[18] = $value[1];
						if (isset($taxpayer[$i][$j+1]) && preg_match('/^PG-(\d+)/', $taxpayer[$i][$j+1], $value)) $formPayer[19] = $value[1];
					}
				}
			}
				
			// county taxable amount
			for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					if (preg_match('/COUNTY TAXABLE VALUE/', $taxpayer[$i][$j])) $formPayer[20] = chop(str_replace(',','',$taxpayer[$i][$j+1]));
				}
			}

		// school taxable amount
			for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					if (preg_match('/SCHOOL TAXABLE VALUE/', $taxpayer[$i][$j])) $formPayer[21] = chop(str_replace(',','',$taxpayer[$i][$j+1]));
				}
			}	
		// city taxable amount
			for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					if (isset($taxpayer[$i][$j]) && preg_match('/^(CITY|TOWN)/', $taxpayer[$i][$j])) {
						if (isset($taxpayer[$i][$j+1]) && preg_match('/^TAXABLE VALUE/', $taxpayer[$i][$j+1])) $formPayer[22] =  chop(str_replace(',','',$taxpayer[$i][$j+2]));
						
					}
				}	
			}
	
		
		// field relating to solar power (for munis that have such laws)
		$formPayer[24] = '';
		for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/solar/i', $taxpayer[$i][$j])) {
					$formPayer[24] .= "{$taxpayer[$i][$j]},";
				}
			}	
		}	
		
		// STAR
		$formPayer[25] = '';
		for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/ STAR/', $taxpayer[$i][$j])) {
					$formPayer[25] .= "{$taxpayer[$i][$j]},";
				}
			}	
		}
		
		// STAR
		$formPayer[26] = '';
		for ($i = 0; $i < count($taxpayer); $i++) {
			for ($j = 0; $j < count($taxpayer[$i]); $j++) {
				if (preg_match('/(VET WAR|CW_15_VET|VETWAR|VETDIS|VETERANS)/', $taxpayer[$i][$j])) {
					$formPayer[26] .= "{$taxpayer[$i][$j]},";
				}
			}	
		}	
		
		// SCHOOL
		$formPayer[27] = $taxpayer[2][1];	
		
		// columns 28+ are special districts
		$l = 28;
		
		foreach ($specialDistType as $type) {	
			$formPayer[$l] = '';
				
			for ($i = 0; $i < count($taxpayer); $i++) {
				for ($j = 0; $j < count($taxpayer[$i]); $j++) {
					if (isset($taxpayer[$i][$j]) && preg_match('/^(\w\w)(\d\d\d) (.*?)( TO|$|\d{2,})/', $taxpayer[$i][$j],$matches)) {
						if ($matches[1] == $type) $formPayer[$l] .= "{$matches[1]}{$matches[2]} {$matches[3]} ";
					}
				}	
			}
			
			$l++;
		}
		
		
		// sort and add missing keys
		for ($i = 0; $i < count($formPayer); $i++) {
			if (!isset($formPayer[$i])) $formPayer[$i] = '';
		}
		
		
		ksort($formPayer);
		
				// shift onto the rolls county, town, village, swis
		for ($i = 0; $i < count($taxpayer); $i++) {
				
				if ($taxpayer[$i][0] != 'location') continue;
				
				// add array to line				
				for ($j = count($taxpayer[$i])-1; $j > 0; $j--) array_unshift($formPayer, $taxpayer[$i][$j]);
				
		}
		
		
		$formTax[] = $formPayer;
		
		}


		// lastly sort form by street and number
		
	    $addNum = array();
        $addSt = array();
        $own1 = array();
		for ($i = 0; $i < count($formTax); $i++) {
		  $addSt[] = $formTax[$i][6];
		  $addNum[] = $formTax[$i][5]; 
		  $own1[] =  $formTax[$i][7];
		}

		// now apply sort
		array_multisort($addSt, SORT_ASC, 
				$addNum, SORT_NUMERIC, SORT_ASC,
				$own1, SORT_ASC, 
				$formTax);
				
				
	//print_r($formTax);

	echo "\nWriting to CSV ...";

	// print out form
	$output .=  '"Tax Roll","County","Town","Village","SWIS","Tax ID","Street Number","Street Name","Owner 1","Owner 2","Owner 3","Owner 4",'
				.'"Mail Address 1","Mail Address 2","Mail Address 3","Mail City","Mail State","Mail Zip",'
				.'"Property Type","East","North","Acres","Full Market Value","Deed Book","Deed Pg",'
				.'"County Value","School Value","Town Value","Full Street",'
				.'"Solar","STAR","VETS","School",';
				
	foreach ($specialDistType as $type) {
		$output .= "\"$type\",";
	}
				
	$output .=  "\n";

	foreach ($formTax as $line) {
		$output .=  '"'.$town.'",';
		foreach ($line as $item) {
			$output .=  '"'.$item.'",';
		}
		
		$output .=  "\n";
	}
	
	// save output to file
	file_put_contents("output-csv/$town.csv", $output);
	
	echo " DONE\n";
}

// last, create a great big file
//system("cat output-csv/*.csv > all-property.csv");

system("zip output-csv.zip output-csv/*");


1 Comment

  • Brad Stratton says:

    This is great! I wish I had found this 3 years ago. I was wondering if it’s possible to add in the additional Ag District info – ie. “MAY BE SUBJECT TO PAYMENT UNDER AGDIST LAW TIL 2024.” It didn’t transfer over in the test I did.

1 Trackback or Pingback

Leave a Reply

Your email address will not be published. Required fields are marked *