Python script that automates fetching NSDUH crosstab data from the SAMHSA RDAS API.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 7.8KB

il y a 4 ans
il y a 4 ans
il y a 4 ans
il y a 4 ans
il y a 4 ans
il y a 4 ans
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. import requests, time, os, sys, json, csv
  2. from bs4 import BeautifulSoup
  3. class DataTable:
  4. def __init__(self):
  5. self.table = {}
  6. self.cols = {}
  7. self.rows = {}
  8. class DataPocket:
  9. def __init__(self):
  10. self.rows = {}
  11. def addRow(self, rownum, name=None, values=None):
  12. ### values should be a list in column order
  13. rownum = int(rownum)
  14. d = {'name': name,
  15. 'values': [] if values == None else [x for x in values]
  16. }
  17. self.rows[rownum] = d
  18. def addName(self, rownum, name):
  19. if int(rownum) in self.rows.keys():
  20. self.rows[rownum]['name'] = name
  21. else:
  22. print("No such row.")
  23. def addValues(self, rownum, values):
  24. ### values should be a list in column order
  25. if int(rownum) in self.rows.keys():
  26. self.rows[rownum]['values'] = [x for x in values]
  27. else:
  28. print("No such row.")
  29. def setValue(self, rownum, value, col):
  30. ### col should be a zero-indexed integer
  31. col = int(col)
  32. if int(rownum) in self.rows.keys():
  33. try:
  34. self.rows[rownum]['values'][col] = value
  35. except IndexError:
  36. while len(self.rows[rownum]['values'] < col):
  37. self.rows[rownum]['values'].append(None)
  38. self.rows[rownum]['values'].append(value)
  39. else:
  40. print("No such row.")
  41. def printRow(self, rownum):
  42. if int(rownum) in self.rows.keys():
  43. print("Row {} (\"{}\"): {}".format(rownum, self.rows[rownum]['name'], self.rows[rownum]['values']))
  44. else:
  45. print("No such row.")
  46. def printData(self):
  47. for row in self.rows.keys():
  48. self.printRow(row)
  49. class DataColl:
  50. def __init__(self):
  51. self._year = None
  52. self._row = None
  53. self._col = None
  54. self._control = None
  55. self._cf_eq = None
  56. self._control_filter = None
  57. self._weight = None
  58. self._chisq = None
  59. self._fmt = None
  60. @property
  61. def year(self):
  62. return self._year
  63. @year.setter
  64. def year(self, year):
  65. # if int(year) < 2014:
  66. # print("NSDUH does not have data from before 2014. Setting to 2014.")
  67. # self._year = 2014
  68. self._year = int(year)
  69. self._generate()
  70. @property
  71. def row(self):
  72. return self._row
  73. @row.setter
  74. def row(self, r):
  75. self._row = r
  76. self._generate()
  77. @property
  78. def col(self):
  79. return self._col
  80. @col.setter
  81. def col(self, c):
  82. self._col = c
  83. self._generate()
  84. @property
  85. def control(self):
  86. return self._control
  87. @control.setter
  88. def control(self, ctl):
  89. self._control = ctl
  90. self._generate()
  91. @property
  92. def cf_eq(self):
  93. return self._cf_eq
  94. @cf_eq.setter
  95. def cf_eq(self, eq):
  96. self._cf_eq = eq
  97. self._generate()
  98. @property
  99. def control_filter(self):
  100. return self._control_filter
  101. @control_filter.setter
  102. def control_filter(self, filter):
  103. self._control_filter = filter
  104. self._generate()
  105. @property
  106. def weight(self):
  107. return self._weight
  108. @weight.setter
  109. def weight(self, wgt):
  110. self._weight = wgt
  111. self._generate()
  112. @property
  113. def chisq(self):
  114. return self._chisq
  115. @chisq.setter
  116. def chisq(self, cs):
  117. self._chisq = cs
  118. self._generate()
  119. @property
  120. def fmt(self):
  121. return self._fmt
  122. @fmt.setter
  123. def fmt(self, f):
  124. self._fmt = f
  125. self._generate()
  126. @property
  127. def endpoint(self):
  128. return self._endpoint
  129. def _generate(self):
  130. self.setEndpoint(False)
  131. self.setFilename(False)
  132. def setEndpoint(self, loud=True):
  133. if loud and self._year == None or self._row == None or self._col == None:
  134. print("Year, row, and column must be set in order to build an endpoint.")
  135. self._endpoint = None
  136. lcontrol = "" if self.control == None else "&control={}".format(self.control)
  137. lcfnot = "" if self.cf_eq == True else "!"
  138. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "&filter={}{}%3D{}".format(self.control, lcfnot, self.control_filter)
  139. lweight = "" if self.weight == None else "&weight={}".format(self.weight)
  140. lchisq = "&run_chisq=false" if self.chisq == False or self.chisq == None else "&run_chisq=true"
  141. lfmt = "json" if (self.fmt == None or self.fmt not in ["json", "msgpack", "api"]) else self.fmt
  142. year_rng = "{}-{}".format(int(self.year), int(self.year)+1)
  143. self._endpoint = "https://rdas.samhsa.gov/api/surveys/NSDUH-{}-RD02YR/crosstab/?row={}&column={}{}{}{}{}&format={}".format(
  144. year_rng,
  145. self.row,
  146. self.col,
  147. lcontrol,
  148. lcontrol_filter,
  149. lweight,
  150. lchisq,
  151. lfmt
  152. )
  153. @property
  154. def filename(self):
  155. return self._filename
  156. def setFilename(self, loud=True, ext=None):
  157. if loud and self._year == None or self._row == None or self._col == None:
  158. print("Year, row, and column must be set in order to build a filename.")
  159. self._filename = None
  160. lext = "csv" if ext == None else ext
  161. lcontrol = "" if self.control == None else "_ctl_{}".format(self.control)
  162. lcfnot = "" if self.cf_eq == True else "n"
  163. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "_{}eq_{}".format(lcfnot, self.control_filter)
  164. lweight = "" if self.weight == None else "_weight_{}".format(self.weight)
  165. lchisq = "_chisq_false" if self.chisq == False or self.chisq == None else "_chisq_true"
  166. self._filename = "NSDUH_{}_{}_vs_{}{}{}{}{}.{}".format(
  167. self.year,
  168. self.row,
  169. self.col,
  170. lcontrol,
  171. lcontrol_filter,
  172. lweight,
  173. lchisq,
  174. lext
  175. )
  176. def toString(self):
  177. print("Data Collector:")
  178. print("Row: {}, Column: {}".format(self.row, self.col))
  179. print("Controlling on {} {}= {}".format(self.control, self.control_filter[0], self.control_filter[1]))
  180. print("Weighted by {}".format(self.weight))
  181. print("Generating Chi-Squared" if self.chisq else "Not generating Chi-Squared")
  182. print("Formatting as {}".format(self.fmt))
  183. print("URL: {}".format(self.endpoint))
  184. def main():
  185. df = DataColl()
  186. df.year = 2017
  187. df.row = "STATE"
  188. df.col = "YOSELL2"
  189. df.control = "CATAG18"
  190. df.cf_eq = True
  191. df.control_filter = "2"
  192. df.weight = "DASWT_1"
  193. df.chisq = False
  194. df.fmt = "json"
  195. print(df.endpoint)
  196. df.setFilename(False, "csv")
  197. r = requests.get(df.endpoint)
  198. rjson = r.json() # this creates a Python object, not a JSON string
  199. # in results/cells, column_option and row_option refer to the key field, not the list index
  200. # in column_option and row_option, missing value is coded as "", but coded as "." in key field
  201. # What are the values of "column_option" and "row_option"?
  202. # rc = rjson["results"]["cells"]
  203. # for i in range(10):
  204. # print(rc[i])
  205. # cols = []
  206. # rows = []
  207. # for v in rc:
  208. # print(v.keys())
  209. # if v["column_option"] not in cols:
  210. # cols.append(v["column_option"])
  211. # if v["row_option"] not in rows:
  212. # rows.append(v["row_option"])
  213. # print(f"Cols: {sorted(cols)}")
  214. # print(f"Rows: {sorted(rows)}")
  215. # This just confirms that there are, in fact, 51 states incl. DC in this query
  216. # ro = rjson["results"]["row"]["options"]
  217. # titles = {}
  218. # for v in ro:
  219. # titles[int(v["key"])] = (v["title"])
  220. # for t in sorted(titles.keys()):
  221. # print(f"{titles[t]}")
  222. if __name__ == "__main__":
  223. main()