diff --git a/.gitignore b/.gitignore
index 33712b8..a701a06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,13 @@
 # STANDARD JULIA IGNORE
 /Manifest.toml
 docs/build/
+
+# ---------------------------------------------------------
+
+# ---------------------------------------------------------
+# DEVELOPMENT FILES
+sandbox.jl
+sandbox.R
+tmp
+# ---------------------------------------------------------
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 362b60a..29f9ce0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FinanceRoutines"
 uuid = "2e4c0fa2-b49b-4c8f-9592-485f04b9fc03"
-authors = "Erik Loualiche "
+authors = ["Erik Loualiche "]
 version = "0.1.0"
 
 [deps]
@@ -9,15 +9,13 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+FlexiJoins = "e37f2e79-19fa-4eb7-8510-b63b51fe0a37"
+IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
+LibPQ = "194296ae-ab2e-5f79-8cd4-7183a0a5a0d1"
+Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
+MonthlyDates = "5be0b35e-b7aa-4f8f-be3c-193ee1a845a6"
+PanelShift = "d68e4d5e-4a60-4df1-b225-9a1636c75ae0"
+ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
 ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-
-[compat]
-CSV = "0.10"
-julia = "1"
-
-[extras]
-Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[targets]
-test = ["Markdown", "Test"]
diff --git a/README.md b/README.md
index 5172bd6..dfb7923 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,57 @@
 # FinanceRoutines
 
-https://juliadocs.github.io/Documenter.jl/stable
+| **Documentation** | **Build Status** |
+|:-----------------------------------------:|:-----------------------------------------------------:|
+| [![][docs-stable-img]][docs-stable-url] | [![Build Status](https://github.com/eloualiche/FinanceRoutines.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/eloualiche/FinanceRoutines.jl/actions/workflows/CI.yml?query=branch%3Amain) |
+
 
 ## Functions
 
 1. Import Financial data
-   - `ImportFinanceData.jl`
\ No newline at end of file
+  - `import_FF3`
+  - `build_crsp`
+
+## To Do
+
+ - Time lags for panel data (return `missing` when the lagged observation is not exactly one month prior).
+ - `olsgmm` from cochrane GMM code + - rolling regressions + + +## References + +- [WRDS demo on momentum](https://wrds-www.wharton.upenn.edu/documents/1442/wrds_momentum_demo.html) +- Tidy Finance [Book](https://www.tidy-finance.org) and [repo](https://github.com/tidy-finance/website) +- French data [R package](https://nareal.github.io/frenchdata/articles/basic_usage.html) +- Ian Gow [Quarto Book](https://iangow.github.io/far_book/ident.html) +- Replication [Open Source AP](https://github.com/OpenSourceAP/CrossSection/tree/master) + +[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg +[docs-stable-url]: https://eloualiche.github.io/FinanceRoutines.jl/ + +## Examples + +### Import data from WRDS + +First import the monthly stock file and the compustat funda file +```julia +using FinanceRoutines +# Set up a wrds connection +wrds_conn = FinanceRoutines.open_wrds_pg() + +# CRSP +df_msf = import_MSF(wrds_conn); # Import the monthly stock file +df_msf = build_MSF(df_msf); # Run common processing +# Compustat +df_funda = import_Funda(wrds_conn); +df_funda = build_Funda(df_funda); +# Merge both files +df_linktable = FinanceRoutines.import_ccm_link(wrds_conn) +df_msf = link_MSF(df_linktable, df_msf) # merge gvkey on monthly stock file +df_msf = innerjoin(df_msf, df_funda, on = [:gvkey, :date_y], matchmissing=:notequal) +``` + diff --git a/docs/make.jl b/docs/make.jl index 7af99d1..20467a3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -11,4 +11,7 @@ makedocs( ]) deploydocs(; repo="github.com/eloualiche/FinanceRoutines.jl", + target = "build", + branch = "gh-pages", + ) diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl index bcbe445..62b49c1 100644 --- a/src/FinanceRoutines.jl +++ b/src/FinanceRoutines.jl @@ -1,27 +1,58 @@ module FinanceRoutines -# --------------------------------------------------------- +# ------------------------------------------------------------------------------------------ import Downloads import ZipFile import CSV -import DataFrames: DataFrame, rename! -import DataFramesMeta: DataFramesMeta, @subset!, @transform! -import Dates: Date -# --------------------------------------------------------- - - -# --------------------------------------------------------- +import DataFrames: DataFrame, ByRow, groupby, nrow, passmissing, Not, + rename!, select!, groupby, transform!, leftjoin, disallowmissing! +import DataFramesMeta: DataFramesMeta, + @passmissing, @subset!, @rsubset!, @transform!, @rtransform! +import Dates: Dates, Date, Month, year +import Downloads: Downloads.download +import FlexiJoins: innerjoin, by_key, by_pred +import IntervalSets:(..) +import LibPQ: LibPQ.execute, LibPQ.Connection +import Missings: Missings, missing +import MonthlyDates: MonthlyDate +import PanelShift: panellag! 
+import ShiftedArrays: lag +import Tables: columntable +import WeakRefStrings: String3, String7, String15 +import ZipFile: ZipFile.Reader +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ # Import functions +include("Utilities.jl") include("ImportFinanceData.jl") -# --------------------------------------------------------- +include("ImportCRSP.jl") +include("ImportComp.jl") +include("Merge_CRSP_Comp.jl") +# ------------------------------------------------------------------------------------------ -# --------------------------------------------------------- +# ------------------------------------------------------------------------------------------ # List of exported functions export greet_FinanceRoutines # for debugging -export import_FF3 # read monthly FF3 -# --------------------------------------------------------- + +# WRDS +# -- CRSP +export import_MSF # import Monthly Stock File +export import_DSF # import Daily Stock File +export build_MSF # clean Monthly Stock File +# -- Funda +export import_Funda +export build_Funda +# -- Link +export link_Funda +export link_MSF + +# FF +export import_FF3 +# ------------------------------------------------------------------------------------------ end diff --git a/src/ImportCRSP.jl b/src/ImportCRSP.jl new file mode 100644 index 0000000..6ec8477 --- /dev/null +++ b/src/ImportCRSP.jl @@ -0,0 +1,358 @@ +# ------------------------------------------------------------------------------------------ +# ImportCRSP.jl + +# Collection of functions that import +# financial data into julia +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +# List of exported functions +# export import_MSF +# export build_MSF + +# list +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +# function list_crsp(; +# wrds_conn, user, password) + +# list_libraries = """ +# WITH RECURSIVE "names"("name") AS ( +# SELECT n.nspname AS "name" +# FROM pg_catalog.pg_namespace n +# WHERE n.nspname !~ '^pg_' +# AND n.nspname <> 'information_schema') +# SELECT "name" +# FROM "names" +# WHERE pg_catalog.has_schema_privilege( +# current_user, "name", 'USAGE') = TRUE; +# """ +# res_list_libraries = execute(wrds_conn, list_libraries); +# df_libraries = DataFrame(columntable(res_list_libraries)) +# @rsubset(df_libraries, occursin(r"crsp", :name) ) + +# library = "crsp" +# list_tables = """ +# SELECT table_name FROM INFORMATION_SCHEMA.views +# WHERE table_schema IN ('$library'); +# """ +# res_list_tables = execute(wrds_conn, list_tables); +# df_tables = DataFrame(columntable(res_list_tables)) +# @rsubset(df_tables, occursin(r"mse", :table_name) ) + +# return run_sql_query(conn, query) + + +# end +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +function import_MSF(wrds_conn::Connection; + date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")), + variables::String = "" + ) + +# set up the query for msf + postgre_query_msf = """ + SELECT PERMNO,PERMCO,DATE,PRC,ALTPRC,RET,RETX,SHROUT + FROM crsp.msf + WHERE DATE >= 
'$(string(date_range[1]))' AND DATE <= '$(string(date_range[2]))' + """ + @time res_q_msf = execute(wrds_conn, postgre_query_msf) + df_msf = DataFrame(columntable(res_q_msf)) + transform!(df_msf, # clean up the dataframe + names(df_msf, check_integer.(eachcol(df_msf))) .=> (x->convert.(Union{Missing, Int}, x)); + renamecols = false); + +# set up the query for mse + # postgre_query = """ + # SELECT DATE, PERMNO, SHRCD, EXCHCD, HEXCD + # FROM crsp.mse + # WHERE EXTRACT('Year' FROM DATE) = 2013 + # """ + # res = LibPQ.execute(wrds_conn, postgre_query) + # df_mse = DataFrame(columntable(res)) + # # convert to Int these flag variables + # transform!(df_mse, + # names(df_mse, Union{Missing, Float64}) .=> (x->convert.(Union{Missing, Int}, x)); + # renamecols = false) + # # @rsubset(df_mse, !ismissing(:shrcd) ) + # df_mse + # @rsubset(df_mse, :hexcd ∈ (1, 2, 3) ) + # @rsubset(df_mse, :shrcd ∈ (10, 11) ) + # df_mse.permno |> unique + +# set up the query for msenames + postgre_query_msenames = """ + SELECT PERMNO, NAMEDT, NAMEENDT, SHRCD, EXCHCD, HEXCD, NAICS, HSICCD, CUSIP + FROM crsp.msenames + """ + res_q_msenames = execute(wrds_conn, postgre_query_msenames) + df_msenames = DataFrame(columntable(res_q_msenames)) ; + transform!(df_msenames, + names(df_msenames, check_integer.(eachcol(df_msenames))) .=> (x->convert.(Union{Missing, Int}, x)); + renamecols = false) ; + df_msenames[!, :cusip] .= String15.(df_msenames[!, :cusip]); + df_msenames[ .!ismissing.(df_msenames.naics) , :naics] .= String7.(skipmissing(df_msenames[!, :naics])); + @rsubset!(df_msenames, :exchcd <= 3 ) ;# we keep negative values + @rsubset!(df_msenames, :shrcd ∈ (10, 11) ) ; + +# set up the query for msedelist + postgre_query_msedelist = """ + SELECT PERMNO, DLSTDT, DLRET, DLSTCD + FROM crsp.msedelist + """ + res_q_msedelist = execute(wrds_conn, postgre_query_msedelist) + df_msedelist = DataFrame(columntable(res_q_msedelist)) ; + transform!(df_msedelist, + names(df_msedelist, check_integer.(eachcol(df_msedelist))) .=> (x->convert.(Union{Missing, Int}, x)); + renamecols = false) ; + @rtransform!(df_msedelist, :datem = MonthlyDate(:dlstdt)); + +# --- merge all of the datasets together + df_msf_final = innerjoin( + (df_msf, df_msenames), + by_key(:permno) & by_pred(:date, ∈, x->x.namedt..x.nameendt) + ) + @rtransform!(df_msf_final, :datem = MonthlyDate(:date) ); + df_msf_final = leftjoin(df_msf_final, df_msedelist, on = [:permno, :datem]) + select!(df_msf_final, + :permno, # Security identifier + :date, # Date of the observation + :datem, + :ret, # Return + :retx, # Return excluding dividends + :shrout, # Shares outstanding (in thousands) + :altprc, # Last traded price in a month + :exchcd, # Exchange code + :hsiccd, # Industry code + :naics, # Industry code + :dlret, # Delisting return + :dlstcd # Delisting code + ) + sort!(df_msf_final, [:permno, :date]); + # unique(df_msf_final, [:permno, :date]) + + return df_msf_final + +end + +# when there are no connections establisheds +function import_MSF(; + date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")), + variables::String = "", + user::String = "", password::String = "") + + if user == "" + wrds_conn = open_wrds_pg() + else + wrds_conn = open_wrds_pg(user, password) + end + + import_MSF(wrds_conn, date_range=date_range, variables=variables) +end +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +""" + This comes after 
import_MSF and applies standard cleaning steps to the monthly stock file (market cap, lagged market cap, delisting-adjusted returns).
+"""
+function build_MSF(df_msf::DataFrame;
+    save::String = ""
+    )
+
+# Create marketcap:
+    @rtransform!(df_msf, :mktcap = abs(:shrout * :altprc)) # in 1000s
+    df_msf[ isequal.(df_msf.mktcap, 0), :mktcap] .= missing;
+
+# Lagged marketcap
+    sort!(df_msf, [:permno, :datem])
+    # method 1: lag and then merge back
+    # df_msf_mktcap_lag = @select(df_msf,
+    #     :datem = :datem + Month(1), :permno, :l1m_mktcap2 = :mktcap)
+    # df_msf = leftjoin(df_msf, df_msf_mktcap_lag, on = [:permno, :datem])
+    panellag!(df_msf, :permno, :datem,
+        :mktcap, :l1m_mktcap, Month(1))
+
+# Adjusted returns (see Tidy Finance, following Bali, Engle, and Murray)
+    @rtransform! df_msf :ret_adj =
+        ismissing(:dlstcd) ? :ret :
+        !ismissing(:dlret) ? :dlret :
+        (:dlstcd ∈ (500, 520, 580, 584)) || ((:dlstcd >= 551) & (:dlstcd <= 574)) ? -0.3 :
+        :dlstcd == 100 ? :ret : -1.0
+
+# select variables and save
+    select!(df_msf, :permno, :date, :ret, :ret_adj, :mktcap, :l1m_mktcap, :retx,
+        :naics, :hsiccd)
+    if !(save == "")
+        CSV.write(save * "/msf.csv.gz", df_msf, compress=true)
+    end
+
+    return df_msf
+end
+
+
+function build_MSF(wrds_conn::Connection;
+    date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")),
+    save::String = "",
+    )
+
+    df_msf = import_MSF(wrds_conn; date_range=date_range);
+    df_msf = build_MSF(df_msf, save = save)
+    return df_msf
+end
+
+
+function build_MSF(;
+    date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")),
+    save::String = "",
+    )
+
+    df_msf = import_MSF(;date_range);
+    df_msf = build_MSF(df_msf, save = save)
+
+    return df_msf
+end
+# ------------------------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------------------------
+function import_DSF(wrds_conn::Connection;
+    date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")),
+    variables::String = ""
+    )
+
+# set up the query for dsf
+    postgre_query_dsf = """
+        SELECT PERMNO, DATE, RET, PRC, SHROUT, VOL
+        FROM crsp.dsf
+        WHERE DATE >= '$(string(date_range[1]))' AND DATE <= '$(string(date_range[2]))'
+        """
+    @time res_q_dsf = execute(wrds_conn, postgre_query_dsf)
+    @time df_dsf = DataFrame(columntable(res_q_dsf))
+    # clean up the dataframe
+    transform!(df_dsf,
+        names(df_dsf, check_integer.(eachcol(df_dsf))) .=> (x->convert.(Union{Missing, Int}, x));
+        renamecols = false)
+
+    return df_dsf
+end
+
+# when there is no established connection
+function import_DSF(;
+    date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")),
+    variables::String = "",
+    user::String = "", password::String = "")
+
+    if user == ""
+        wrds_conn = open_wrds_pg()
+    else
+        wrds_conn = open_wrds_pg(user, password)
+    end
+
+    return import_DSF(wrds_conn, date_range=date_range, variables=variables)
+end
+# ------------------------------------------------------------------------------------------
+
+
+
+
+
+
+
+
+
+
+
+
+
+# # ------------------------------------------------------------------------------------------
+# """
+# Multiple dispatch to load from clean CSV
+# """
+
+# # What are the classes?
+# msf_col_classes = [Int, String, Int, Int, Int, # "PERMNO" "date" "NAMEENDT" "SHRCD" "EXCHCD" +# String, String, String, String, String, # "SICCD" "NCUSIP" "TICKER" "COMNAM" "SHRCLS" +# String, Int, String, String, String, # "TSYMBOL" "NAICS" "PRIMEXCH" "TRDSTAT" "SECSTAT" +# Int, Int, Int, String, String, # "PERMCO" "ISSUNO" "HEXCD" "HSICCD" "CUSIP" +# Int, Float64, Int, Int, Int, # "DCLRDT" "DLAMT" "DLPDT" "DLSTCD" "NEXTDT +# Int, Int, Int, Int, Int, # "PAYDT" "RCRDDT" "SHRFLG" "HSICMG" "HSICIG" +# Int, Float64, Float64, Float64, Int, # "DISTCD" "DIVAMT" "FACPR" "FACSHR" "ACPERM" +# Int, Int, Int, String, Float64, # "ACCOMP" "SHRENDDT" "NWPERM" "DLRETX" "DLPRC" +# String, Int, Int, Int, Int, # "DLRET" "TRTSCD" "NMSIND" "MMCNT" "NSDINX" +# Float64, Float64, Float64, Int, String, # "BIDLO" "ASKHI" "PRC" "VOL" "RET" +# Float64, Float64, Int, Float64, Float64, # "BID" "ASK" "SHROUT" CFACPR" "CFACSHR" +# Float64, Float64, Int, String, # "ALTPRC" "SPREAD" "ALTPRCDT" "RETX" +# Float64, Float64, Float64, Float64, Float64]; # "vwretd" "vwretx" "ewretd" "ewretx" "sprtrn" + +# # KEEP SOME COLUMNS +# col_keep = [:PERMNO, :NAICS, :SICCD, :HSICCD, :date, :PRC, :RET, :SHROUT, +# :HEXCD, :SHRCD] ; +# col_keep = vcat(col_keep, variables); +# # col_keep = intersect(Symbol.(names(df_crsp)), col_keep); + +# # READ THE FILE +# df_msf = CSV.File( +# expanduser(path_to_file); +# header=1, types = msf_col_classes, +# silencewarnings=true, missingstring="NA", delim=',', +# select = col_keep +# ) |> DataFrame; + +# # Lower Case Names +# rename!(df_msf, lowercase.(names(df_msf))); + +# # Filter Stock CLASSES +# @rsubset!(df_msf, :hexcd ∈ (1, 2, 3) ) +# @rsubset!(df_msf, :shrcd ∈ (10, 11) ) + +# # FILTER THE DATE RANGE +# @rtransform!(df_msf, :datey = tryparse(Int, :date[1:4])); +# @subset!(df_msf, :datey .>= date_range[1], :datey .<= date_range[2] ) + +# # CLEAN UP THE MAIN VARIABLES (return | market cap | date) +# @rtransform!(df_msf, :ret = passmissing(tryparse)(Float64, :ret) ); # returns +# @rtransform!(df_msf, :ret = (x -> isnothing(x) ? missing : x)(:ret) ); +# @rtransform!(df_msf, :me = abs(:prc) * :shrout); +# @rtransform!(df_msf, :date = Date(:datey, tryparse(Int, :date[5:6]), tryparse(Int, :date[7:8]))); +# @rtransform!(df_msf, :datem = MonthlyDate(:date) ); + +# # BEFORE LAGS, REMOVE DUPLICATES BY MONTH|PERMNO +# # Look for unique keys within returns and market equity +# unique!(df_msf, [:permno, :datem, :ret, :me]); +# sort!(df_msf, [:permno, :date]); + +# # LAG ME for one month +# @transform!(groupby(df_msf, :permno), +# :l1m_me = lag(:me, 1), :l1m_datem = lag(:datem, 1) ); +# # missing if lag is not one month prior +# @rtransform!(df_msf, +# @passmissing :l1m_me = (:l1m_datem + Dates.Month(1) .== :datem) ? 
:l1m_me : Base.missing); + +# # CLEAN UP THE COLUMNS +# select!(df_msf, Not(intersect([:shrcd, :hexcd, :l1m_datem], Symbol.(names(df_msf))))) +# select!(df_msf, vcat([:permno, :date, :ret, :l1m_me], Symbol.(names(df_msf))) |> unique) + +# return df_msf + +# end +# # ------------------------------------------------------------------------------------------ + + +# # ------------------------------------------------------------------------------------------ +# # Utilities (non exported) +# function check_integer(x::AbstractVector) +# for i in x +# !(typeof(i) <: Union{Missing, Number}) && return false +# ismissing(i) && continue +# isinteger(i) && continue +# return false +# end +# return true +# end diff --git a/src/ImportComp.jl b/src/ImportComp.jl new file mode 100644 index 0000000..d5d6d37 --- /dev/null +++ b/src/ImportComp.jl @@ -0,0 +1,96 @@ +# ------------------------------------------------------------------------------------------ +# ImportComp.jl + +# Collection of functions that import +# compustat data into julia +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +# List of exported functions +# export import_MSF +# export build_MSF + +# list +# ------------------------------------------------------------------------------------------ + + + +# ------------------------------------------------------------------------------------------ +function import_Funda(wrds_conn::Connection; + date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")), + variables::String = "" + ) + + var_funda = ["GVKEY", "DATADATE", "SICH", "FYR", "FYEAR", + "AT", "LT", "SALE", "EBITDA", "CAPX", "NI", "DV", "CEQ", "CEQL", "SEQ", + "TXDITC", "TXP", "TXDB", "ITCB", "DVT", "PSTK","PSTKL", "PSTKRV"] + +# set up the query for msf + postgre_query_funda_full = """ + SELECT * + FROM comp.funda + WHERE INDFMT = 'INDL' AND DATAFMT = 'STD' AND CONSOL = 'C' AND POPSRC = 'D' + AND DATADATE >= '$(string(date_range[1]))' + AND DATADATE <= '$(string(date_range[2]))' + """ + postgre_query_funda_var = """ + SELECT $(join(var_funda, ",")) + FROM comp.funda + WHERE INDFMT = 'INDL' AND DATAFMT = 'STD' AND CONSOL = 'C' AND POPSRC = 'D' + AND DATADATE >= '$(string(date_range[1]))' + AND DATADATE <= '$(string(date_range[2]))' + """ + @time res_q_funda = execute(wrds_conn, postgre_query_funda_var) + df_funda = DataFrame(columntable(res_q_funda)); + + # clean up the dataframe + transform!(df_funda, + names(df_funda, check_integer.(eachcol(df_funda))) .=> (x->convert.(Union{Missing, Int}, x)); + renamecols = false) + df_funda[!, :gvkey] .= parse.(Int, df_funda[!, :gvkey]); + + return df_funda + +end + +function import_Funda(; + date_range::Tuple{Date, Date} = (Date("1900-01-01"), Date("2030-01-01")), + variables::String = "", + user::String = "", password::String = "") + + if user == "" + wrds_conn = open_wrds_pg() + else + wrds_conn = open_wrds_pg(user, password) + end + + import_Funda(wrds_conn, date_range=date_range, variables=variables) +end +# ------------------------------------------------------------------------------------------ + + + + +# ------------------------------------------------------------------------------------------ +function build_Funda(df_funda::DataFrame; + save::String = "" + ) + + # define book equity value + @transform!(df_funda, :be = + coalesce(:seq, :ceq + :pstk, :at - :lt) + coalesce(:txditc, :txdb + :itcb, 0) - + coalesce(:pstkrv, :pstkl, :pstk, 0) ) + 
df_funda[ isless.(df_funda.be, 0), :be] .= missing; + @rtransform!(df_funda, :date_y = year(:datadate)); + sort!(df_funda, [:gvkey, :date_y, :datadate]) + unique!(df_funda, [:gvkey, :date_y], keep=:last) # last obs + + if !(save == "") + CSV.write(save * "/funda.csv.gz", df_funda, compress=true) + end + + return df_funda +end +# ------------------------------------------------------------------------------------------ + diff --git a/src/ImportFinanceData.jl b/src/ImportFinanceData.jl index d7379f9..1db01e0 100644 --- a/src/ImportFinanceData.jl +++ b/src/ImportFinanceData.jl @@ -15,7 +15,6 @@ # --------------------------------------------------------- function greet_FinanceRoutines() - println("Hello FinanceRoutines!") return "Hello FinanceRoutines!" end # --------------------------------------------------------- @@ -25,31 +24,28 @@ end function import_FF3() url_FF = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip" + ff_col_classes = [Int, Float64, Float64, Float64, Float64]; + row_lim = div(MonthlyDate(Dates.today()) - MonthlyDate(1926, 7), Dates.Month(1)) - 1 http_response = Downloads.download(url_FF); z = ZipFile.Reader(http_response) ; a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1] - df_FF3 = copy(CSV.File(a_file_in_zip, header=3, footerskip=1) |> DataFrame); + df_FF3 = copy( + CSV.File(a_file_in_zip, + skipto=5, header=4, limit=row_lim, delim=",", + types=ff_col_classes) |> + DataFrame); close(z) rename!(df_FF3, [:dateym, :mktrf, :smb, :hml, :rf]); @subset!(df_FF3, .!(ismissing.(:dateym))); @subset!(df_FF3, .!(ismissing.(:mktrf))); - @transform!(df_FF3, :dateym = parse.(Int, :dateym) ) @subset!(df_FF3, :dateym .>= 190000 ) - @transform!(df_FF3, - :date = Date.(div.(:dateym, 100), rem.(:dateym,100) ), - :mktrf = parse.(Float64, :mktrf), - :smb = parse.(Float64, :smb), - :hml = parse.(Float64, :hml), - :rf = parse.(Float64, :rf) ) - + return(df_FF3) end -# --------------------------------------------------------- -# --------------------------------------------------------- """ import_FF3(frequency::Symbol) diff --git a/src/Merge_CRSP_Comp.jl b/src/Merge_CRSP_Comp.jl new file mode 100644 index 0000000..c843ec3 --- /dev/null +++ b/src/Merge_CRSP_Comp.jl @@ -0,0 +1,113 @@ +# ------------------------------------------------------------------------------------------ +# Merge_CRSP_Comp.jl + +# Collection of functions that import +# compustat data into julia +# ------------------------------------------------------------------------------------------ + + +# ------------------------------------------------------------------------------------------ +# List of exported functions +# export import_MSF +# export build_MSF + +# list +# ------------------------------------------------------------------------------------------ + + + +# ------------------------------------------------------------------------------------------ +function import_ccm_link(wrds_conn::Connection) + +# df_funda = CSV.read("./tmp/funda.csv.gz", DataFrame); +# df_msf = CSV.read("./tmp/msf.csv.gz", DataFrame); + +# Download link table + postgre_query_linktable = """ + SELECT * + FROM crsp.ccmxpf_lnkhist + """ + @time res_q_linktable = execute(wrds_conn, postgre_query_linktable) + + df_linktable = DataFrame(columntable(res_q_linktable)) + transform!(df_linktable, names(df_linktable, check_integer.(eachcol(df_linktable))) .=> + (x->convert.(Union{Missing, Int}, x)); + renamecols = false); + @rtransform!(df_linktable, :gvkey = parse(Int, :gvkey) ); 
+    @rtransform!(df_linktable, :linkprim = String3(:linkprim),
+        :liid = String3(:liid), :linktype = String3(:linktype));
+
+# Prepare the table
+    @rsubset!(df_linktable,
+        :linktype ∈ ("LU", "LC", "LS"), :linkprim ∈ ("P", "C") )
+    # @rsubset(df_linktable, !ismissing(:lpermno))
+    df_linktable[ ismissing.(df_linktable.linkenddt), :linkenddt ] .= Dates.today();
+    disallowmissing!(df_linktable, [:linkdt, :linkenddt, :lpermno]);
+    rename!(df_linktable, :lpermno => :permno);
+
+    return df_linktable
+end
+
+
+# when there is no established connection
+function import_ccm_link(;
+    user::String = "", password::String = "")
+
+    if user == ""
+        wrds_conn = open_wrds_pg()
+    else
+        wrds_conn = open_wrds_pg(user, password)
+    end
+
+    return import_ccm_link(wrds_conn)
+end
+# ------------------------------------------------------------------------------------------
+
+
+
+# ------------------------------------------------------------------------------------------
+function link_Funda(df_linktable::DataFrame, df_funda::DataFrame)
+
+    funda_link_permno = innerjoin(
+        (select(df_funda, :gvkey, :datadate), df_linktable),
+        by_key(:gvkey) & by_pred(:datadate, ∈, x->x.linkdt..x.linkenddt) )
+    select!(funda_link_permno,
+        Not([:gvkey_1, :linkprim, :liid, :linktype, :linkdt, :linkenddt]) )
+
+    return funda_link_permno
+
+end
+
+
+function link_MSF(df_linktable::DataFrame, df_msf::DataFrame)
+# Merge with CRSP
+    df_msf_linked = innerjoin(
+        (df_msf, df_linktable),
+        by_key(:permno) & by_pred(:date, ∈, x->x.linkdt..x.linkenddt)
+    )
+    @rsubset!(df_msf_linked, !ismissing(:gvkey))
+    select!(df_msf_linked, :date, :permno, :gvkey)
+# merge this back
+    df_msf_merged = leftjoin(df_msf, df_msf_linked, on = [:date, :permno], source="_merge")
+    @rtransform!(df_msf_merged, :date_y = year(:date));
+    select!(df_msf_merged, Not(:_merge))
+
+    return df_msf_merged
+end
+
+
+
+
+function link_ccm(df_linktable, df_msf, df_funda;
+    save::String = "")
+
+# link gvkey onto the monthly stock file, then attach fundamentals
+    df_msf_merged = link_MSF(df_linktable, df_msf)
+    df_ccm = leftjoin(
+        df_msf_merged, df_funda,
+        on = [:gvkey, :date_y], matchmissing = :notequal)
+
+    if !(save == "")
+        CSV.write(save * "/ccm.csv.gz", df_ccm, compress=true)
+    end
+
+    return df_ccm
+end
+# ------------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/src/Utilities.jl b/src/Utilities.jl
new file mode 100644
index 0000000..805401e
--- /dev/null
+++ b/src/Utilities.jl
@@ -0,0 +1,41 @@
+
+# ------------------------------------------------------------------------------------------
+function check_integer(x::AbstractVector)
+    for i in x
+        !(typeof(i) <: Union{Missing, Number}) && return false
+        ismissing(i) && continue
+        isinteger(i) && continue
+        return false
+    end
+    return true
+end
+# ------------------------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------------------------
+"""
+    Open a postgresql connection to the WRDS server
+"""
+function open_wrds_pg(user::String, password::String)
+    wrds_conn = Connection(
+        """
+            host = wrds-pgdata.wharton.upenn.edu
+            port = 9737
+            user='$user'
+            password='$password'
+            sslmode = 'require' dbname = wrds
+        """
+    )
+    return wrds_conn
+end
+
+function open_wrds_pg()
+    # prompt for credentials on standard input
+    print("Enter WRDS username: ... ")
+    # read the username entered at the prompt
+    user = readline()
+    print("Enter WRDS password: ... 
") + password = readline() + return open_wrds_pg(user, password); +end +# ------------------------------------------------------------------------------------------ diff --git a/test/runtests.jl b/test/runtests.jl index c2f1fe7..39f5882 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,12 @@ +# --------------------------------------------------------- using FinanceRoutines using Test +import DataFrames: DataFrame, nrow, rename! +# --------------------------------------------------------- + + +# --------------------------------------------------------- @testset "FinanceRoutines.jl" begin # Write your tests here. @@ -8,4 +14,15 @@ using Test @test FinanceRoutines.greet_FinanceRoutines() == "Hello FinanceRoutines!" @test FinanceRoutines.greet_FinanceRoutines() != "Hello world!" + # import_FF3 + df_FF3 = FinanceRoutines.import_FF3(); + @test names(df_FF3) == ["dateym", "mktrf", "smb", "hml", "rf"]; + @test (nrow(df_FF3) >= 1000 & nrow(df_FF3) <= 1250); + df_FF3_daily = FinanceRoutines.import_FF3(:daily); + @test names(df_FF3_daily) == ["dateymd", "mktrf", "smb", "hml", "rf", "date"] + @test (nrow(df_FF3_daily) >= 25_000 & nrow(df_FF3_daily) <= 26_000) + + # build_crsp + end +# --------------------------------------------------------- \ No newline at end of file