# Cleans a CSV file of common syntax errors. csvclean -n myfile.csv # Shows you stats about the file (e.g. col names, type, nulls, min, max, median, std deviation, unique values, most frequent) $csvstat myfile.csv # Peak at excel file to display in terminal $in2csv myfile.xlsx $in2csv myfile.xlsx > myfile.csv # can write xlsx file to csv by chaining operations # Look at data, formats nicely with borders $csvlook myfile.csv # See column names $csvcut -n myfile.csv # Cut out specific columns $csvcut -c 2,5,6 myfile.csv # Cut out columns to just 2,5,6 $csvcut -c col1,col2,col3 myfile.csv # can also specify by name (make sure not to leave spaces) # Chain commands to pass data along using the pipeline symbol | $csvcut -c RespondentID,CollectorID,StartDate,EndDate Sheet_1.csv | csvlook | head # Find cells matching a regular expression (e.g. pattern "ddd-123-dddd") $csvgrep -c phone_number -r "d{3}-123-d{4}" mydata.csv > matchrx.csv # Merge csv files together so you can do aggregate analysis (assuming they have the same headers) $csvstack firstfile.csv secondfile.csv > combinedfile.csv # You can add an additional '-g' flag on csvstack to add a 'grouping column' (e.g. it'll say firstfile, secondfile) # Execute a SQL query directly on a CSV file $csvsql --query "SELECT * FROM mydata WHERE AGE > 30" mydata.csv > query_results.csv # Sort data using csvsort $csvcut -c StartDate,EndDate myfile.csv | csvsort -c StartDate | csvlook | head # use -r to reverse order (i.e. desc) for csvsort # Execute a SQL query by referencing two CSV files and doing a SQL Join $csvsql --query "select m.usda_id, avg(i.sepal_length) as mean_sepal_length from iris as i join irismeta as m on (i.species = m.species) group by m.species" examples/iris.csv examples/irismeta.csv # Turn data (with a unique id col) to be used as a JSON lookup table (or GeoJSON if coordinates available) #[myfile.csv] #slug,place,latitude,longitude #dcl,Downtown Coffee Lounge,32.35066,-95.30181 #tyler-museum,Tyler Museum of Art,32.33396,-95.28174 $csvjson --key slug --indent 4 myfile.csv # Convert from CSV to JSON lookup $csvjson --lat latitude --lon longitude --key slug --crs EPSG:4269 --indent 4 myfile.csv > myfile.json # Convert from CSV to GeoJSON # Generate a create table statement for your csv data $csvsql -i sqlite myfile.csv # Can specify type of db with '-i' flag, other db's include mysql, mssql # Automatically create a SQL table and import a CSV into the database (postgresql) $createdb dbname $csvsql --db postgresql://username:password@localhost/dbname --insert mydata.csv $psql -q dbname -c "d mydata" # shows table, col names, data types $psql -q dbname -c "SELECT * FROM mydata" $ Extract a table from a SQL database into a CSV sql2csv --db postgresql://username:password@localhost/dbname --query "select * from mydata" > extract.csv sql2csv --db mssql://username:password@servername/dbname --query "select top 100 * from [dbname].[dbo].[tablename]" > extract.csv
References:
- https://csvkit.readthedocs.org
- http://www.anthonydebarros.com/2011/09/11/csvkit-data-files/
- https://jonlabelle.com/snippets/view/shell/csvkit-commands
- http://jeroenjanssens.com/2013/09/19/seven-command-line-tools-for-data-science.html
- http://csvkit.readthedocs.org/en/latest/tutorial/3_power_tools.html
- https://csvkit.readthedocs.org/en/0.9.1/scripts/csvclean.html